File size: 9,902 Bytes
d1a829e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import click
from qdrant_client import QdrantClient, models
from openai import OpenAI
from tqdm import tqdm
import json
import requests
import os
from prompts import REVIEWS_SYSTEM_PROMPT, REVIEWS_USER_PROMPT

# API key sent as the `key` query parameter of the TripAdvisor requests below.
# Read from the environment; None when the variable is not set.
TRIPADVISOR_API_KEY = os.environ.get('TRIPADVISOR_API_KEY')


def save_json(data, path):
    """Write ``data`` to ``path`` as JSON, replacing any existing file.

    Args:
        data: JSON-serializable object
        path (str): destination file path
    """
    with open(path, mode="w") as fp:
        json.dump(obj=data, fp=fp)


def get_df(dataset_path, is_hf):
    """Load the hotels dataset as a pandas DataFrame.

    Args:
        dataset_path (str): Hugging Face dataset name/path, or path to a CSV file
        is_hf (bool): if truthy, load with ``datasets.load_dataset`` and use the
            'train' split; otherwise read ``dataset_path`` as CSV

    Returns:
        pd.DataFrame: the loaded dataset
    """
    if not is_hf:
        # Imported lazily so only the branch that runs needs its dependency.
        import pandas as pd
        return pd.read_csv(dataset_path)

    from datasets import load_dataset
    hf_dataset = load_dataset(dataset_path)
    train_split = hf_dataset['train']
    return train_split.to_pandas()


def _concat_reviews(df):
    """Concatenate review titles and texts of all rows into one string.

    Every row contributes a leading newline, followed by a ``Title:`` section
    and a ``Review:`` section for whichever of the two fields is present.
    Missing values (None / NaN) and empty strings are skipped; NaN has to be
    checked explicitly because it is truthy and cannot be concatenated to str.

    Args:
        df (pd.DataFrame): rows with ``review_title`` and ``review_text`` columns

    Returns:
        str: concatenated reviews, '' for an empty frame
    """
    parts = []
    for _, row in df.iterrows():
        parts.append('\n')
        title = row.review_title
        body = row.review_text
        if isinstance(title, str) and title:
            parts.append('\nTitle:\n' + title)
        if isinstance(body, str) and body:
            parts.append('\nReview:\n' + body)

    return ''.join(parts)


def create_reviews_symmary(df, model, hotels, pos_rate=4.0, neg_rate=4.0, n_reviews=6):
    """Create a summary of reviews for each hotel using an OpenAI chat model.

    For every hotel, the ``n_reviews`` longest reviews (by text length) with
    ``rate >= pos_rate`` and the ``n_reviews`` longest with ``rate < neg_rate``
    are concatenated and sent to the model. If neither selection matches any
    row, the reviews with the longest titles are used instead. Hotels for
    which no review content is found are left out of the result.

    Note: the helper columns ``review_text_len`` and ``review_title_len`` are
    added to ``df`` in place.

    Args:
        df (pd.DataFrame): hotels dataset
        model (str): OpenAI model name
        hotels (list): list of hotels to create summaries for
        pos_rate (float): minimum positive rate, inclusive
        neg_rate (float): maximum negative rate, exclusive
        n_reviews (int): number of reviews to consider for each category

    Returns:
        dict: hotel name -> reviews summary
    """
    df['review_text_len'] = df.review_text.str.len().fillna(value=0)
    df['review_title_len'] = df.review_title.str.len().fillna(value=0)

    client = OpenAI()
    hotels_reviews_summary = {}
    for hotel in tqdm(hotels):
        temp = df[df.hotel_name.eq(hotel)]
        temp_pos = temp[temp.rate >= pos_rate].nlargest(n_reviews, 'review_text_len')
        temp_neg = temp[temp.rate < neg_rate].nlargest(n_reviews, 'review_text_len')
        if len(temp_pos) == 0 and len(temp_neg) == 0:
            # No row matched either rate filter: fall back to the longest titles.
            temp_pos = temp.nlargest(n_reviews, 'review_title_len')

        text = _concat_reviews(temp_pos) + _concat_reviews(temp_neg)

        # Rows without any title/text only contribute newlines; skip the API
        # call when there is nothing to summarise.
        if text.strip():
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": REVIEWS_SYSTEM_PROMPT},
                    {"role": "user", "content": REVIEWS_USER_PROMPT.format(text=text)},
                ]
            )
            hotels_reviews_summary[hotel] = response.choices[0].message.content

    # Return only after every hotel was processed (the return used to sit
    # inside the loop and ended it after the first hotel).
    return hotels_reviews_summary


def _get_loc_id(hotel):
    """ Given a hotel name, receive location id.
    In order to get the hotel info, we need to get the location id first.

    The hotel name is passed through ``params`` so that it is URL-encoded
    (names containing ``&``, ``#`` or similar would otherwise corrupt the query).
    Failures are printed and reported as ``None``.

    Args:
        hotel (str): hotel name

    Returns:
        str | None: location id of the first search result, or None if the
            request failed or returned no usable result
    """
    url = "https://api.content.tripadvisor.com/api/v1/location/search"
    params = {
        "key": TRIPADVISOR_API_KEY,
        "searchQuery": hotel,
        "category": "hotels",
        "language": "en",
    }
    headers = {"accept": "application/json"}

    try:
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(url, params=params, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f'Error: {e}')
        return None

    try:
        return response.json()['data'][0]['location_id']
    except (ValueError, LookupError, TypeError) as e:
        # ValueError: body is not JSON; LookupError/TypeError: unexpected shape
        # or empty result list.
        print(f'{response.status_code=}')
        print(f'{response.text=}')
        print(f'Error: {e}')
        return None


def get_hotel_info(hotel):
    """Get hotel info from TripAdvisor.
        The following information is retrieved using the TripAdvisor API:
            - rank
            - ratings distributions
            - subratings
            - amenities

    Fields missing from the response are returned as None / empty containers
    instead of raising. Failures are printed and reported as ``None``.

    Args:
        hotel (str): hotel name

    Returns:
        dict | None: hotel info with keys ``rank``, ``reviews_ratings``,
            ``subratings`` and ``amenities``; None if the location id or the
            details could not be retrieved
    """
    url = "https://api.content.tripadvisor.com/api/v1/location/{loc_id}/details"
    params = {"key": TRIPADVISOR_API_KEY, "language": "en", "currency": "USD"}
    headers = {"accept": "application/json"}

    loc_id = _get_loc_id(hotel)
    if loc_id is None:
        return None

    try:
        response = requests.get(url.format(loc_id=loc_id), params=params, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f'Error: {e}')
        return None

    try:
        details = response.json()
    except ValueError as e:
        print(f'{response.status_code=}')
        print(f'{response.text=}')
        print(f'Error: {e}')
        return None

    # An error payload is valid JSON too, but carries none of the fields below.
    if not response.ok or not isinstance(details, dict):
        print(f'{response.status_code=}')
        print(f'{response.text=}')
        return None

    ranking_data = details.get('ranking_data') or {}
    rank = ranking_data.get('ranking_string')
    reviews_ratings = details.get('review_rating_count')
    raw_subratings = details.get('subratings') or {}
    # Re-key the subratings by their 'name' field.
    subratings = {entry['name']: entry['value'] for entry in raw_subratings.values()}
    amenities = details.get('amenities', [])
    return dict(
        rank=rank,
        reviews_ratings=reviews_ratings,
        subratings=subratings,
        amenities=amenities,
    )


def get_desc(hotel, data):
    """Create a text description of the hotel based on the retrieved data from TripAdvisor.

    The description is made of up to four sentences (rank, rating distribution,
    selected subratings, amenities). A sentence is left out when its data is
    missing or empty, and all values are converted with ``str`` so numeric
    counts/ratings do not break string concatenation.

    Args:
        hotel (str): hotel name
        data (dict): hotel name -> hotel info (as returned by ``get_hotel_info``,
            may be None for hotels whose info could not be retrieved)

    Returns:
        str: hotel text description, '' when no info is available
    """
    info = data.get(hotel)
    if not info:
        return ''

    sections = []

    rank = info.get('rank')
    if rank is not None:
        sections.append(f"Rating: {rank}.")

    reviews_ratings = info.get('reviews_ratings') or {}
    if reviews_ratings:
        distribution = ", ".join(f"{stars}: {count}" for stars, count in reviews_ratings.items())
        sections.append(f"Rating distribution {distribution}.")

    # Only these subratings are reported, in this fixed order.
    subrating_labels = (
        ('rate_location', 'Location'),
        ('rate_sleep', 'Sleep'),
        ('rate_room', 'Room'),
        ('rate_service', 'Service'),
        ('rate_cleanliness', 'Cleanliness'),
    )
    subratings = info.get('subratings') or {}
    specific = ", ".join(
        f"{label} {subratings[key]}" for key, label in subrating_labels if key in subratings
    )
    if specific:
        sections.append(f"Specific ratings: {specific}.")

    amenities = info.get('amenities') or []
    if amenities:
        sections.append("Amenities available: " + ", ".join(str(item) for item in amenities) + ".")

    return " ".join(sections)


def get_payload(hotel, df):
    """Build the metadata stored in the database alongside a hotel's vector.

    Each field is the most frequent value among the hotel's rows in ``df``.
    The price is the first space-separated token of ``price_range``.

    Args:
        hotel (str): hotel name
        df (pd.DataFrame): hotels dataset

    Returns:
        dict: metadata with keys hotel_name, rating, city, country, price
    """
    hotel_rows = df.loc[df.hotel_name == hotel]

    def most_frequent(series):
        # value_counts() sorts by frequency, so the first index entry is the top value.
        return series.value_counts().index[0]

    return {
        'hotel_name': hotel,
        'rating': most_frequent(hotel_rows.rating_value),
        'city': most_frequent(hotel_rows.locality),
        'country': most_frequent(hotel_rows.country),
        'price': most_frequent(hotel_rows.price_range.str.split(' ').str[0]),
    }


@click.command()
@click.option('--dataset-path', default='traversaal-ai-hackathon/hotel_datasets', help='Path to the dataset.')
@click.option('--is-hf/--no-is-hf', default=True, help='Whether the dataset is in huggingface format, csv otherwise.')
@click.option('--db-path', default='data/db', help='Path to the output database.')
@click.option('--collection-name', default='hotels', help='Name of the collection in the database.')
@click.option('--embeddings-model', default='text-embedding-3-large', help='Name of the model to use for embeddings.')
@click.option('--embeddings-size', default=3072, help='Size of the embeddings.')
@click.option('--reviews-model', default='gpt-3.5-turbo-0125', help='Name of the model to use for reviews summary.')
def create_vector_db(dataset_path, is_hf, db_path, collection_name, embeddings_model, embeddings_size, reviews_model):
    """Build or extend the hotels vector database.

    For every hotel of the dataset that is not yet in the collection, a
    description is built from TripAdvisor data and an OpenAI reviews summary,
    embedded, and upserted together with its metadata. The intermediate
    results are saved to reviews_summary.json and hotels_info.json.
    """
    REVIEW_SUMMARIES_PATH = 'reviews_summary.json'
    HOTELS_INFO_PATH = 'hotels_info.json'

    df = get_df(dataset_path, is_hf)

    # Create a collection if it does not exist and filter out hotels that are already in the collection
    qdrant_client = QdrantClient(path=db_path)
    next_point_id = 0
    if not qdrant_client.collection_exists(collection_name):
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=embeddings_size, distance=models.Distance.COSINE),
        )
        hotels = list(df.hotel_name.unique())
    else:
        docs, _ = qdrant_client.scroll(
            collection_name=collection_name,
            limit=1_000_000_000,  # the limit must be an integer, not the float 1e9
            with_payload=True,
            with_vectors=False,
        )
        known_hotels = {doc.payload['hotel_name'] for doc in docs}
        hotels = [hotel for hotel in df.hotel_name.unique() if hotel not in known_hotels]
        # Continue numbering after the existing points; restarting at 0 would
        # overwrite hotels stored by a previous run.
        next_point_id = max((doc.id for doc in docs if isinstance(doc.id, int)), default=-1) + 1
    if not hotels:
        return

    # Create reviews summary using OpenAI
    reviews_summary = create_reviews_symmary(df, reviews_model, hotels) or {}
    save_json(reviews_summary, REVIEW_SUMMARIES_PATH)

    # Get hotel info from TripAdvisor
    hotels_info = {}
    for hotel in tqdm(hotels):
        hotels_info[hotel] = get_hotel_info(hotel)
    save_json(hotels_info, HOTELS_INFO_PATH)

    # Create descriptions and payloads for each hotel
    texts = []
    payloads = []
    for hotel in hotels:
        # get_hotel_info returns None when TripAdvisor has no data for the hotel.
        trip_desc_hotel = get_desc(hotel, hotels_info) if hotels_info.get(hotel) else ''
        review_hotel = reviews_summary.get(hotel)
        # Join whichever parts exist; the previous chained conditional expression
        # dropped the review summary whenever a TripAdvisor description existed.
        text = '\n'.join(part for part in (trip_desc_hotel, review_hotel) if part)
        if not text:
            # An empty string cannot be embedded; the hotel is retried on the next run.
            click.echo(f'Skipping {hotel}: no description available.')
            continue
        payload = get_payload(hotel, df)
        payload['description'] = text
        payloads.append(payload)
        texts.append(text)
    if not texts:
        return

    # Create description embeddings and upsert them to the database
    openai_client = OpenAI()
    embeddings = openai_client.embeddings.create(input=texts, model=embeddings_model)
    points = [
        models.PointStruct(
            id=next_point_id + idx,
            vector=data.embedding,
            payload=payload,
        )
        for idx, (data, payload) in enumerate(zip(embeddings.data, payloads))
    ]
    qdrant_client.upsert(collection_name, points)


# Run the click command when this file is executed as a script.
if __name__ == '__main__':
    create_vector_db()