File size: 6,670 Bytes
cdcef37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92daf4e
cdcef37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da96c85
7bc8b9c
 
cdcef37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# -*- coding: utf-8 -*-
"""[Uma Namboothiripad]Assignment_2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_sofOjXRDnId49NOup4sdiVS1E_51T-b

Load the dataset below
"""

# Lines starting with "!" are notebook shell magics; they are not valid in a
# plain Python interpreter.
# Upgrade spaCy, which is loaded further down for the named-entity demo.
!pip install -U spacy
#first install the library that would help us use BERT in an easy to use interface
#https://github.com/UKPLab/sentence-transformers/tree/master/sentence_transformers
!pip install -U sentence-transformers

"""I was having issues connecting my csv file to the colab notebook, so I ended up connecting this to my drive"""

import spacy
from datasets import load_dataset
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
print(pd.__version__)

# Kaggle command-line client, used below to fetch the dataset.
! pip install -q kaggle

# NOTE(review): lightgbm is not imported anywhere in the visible part of this
# file - confirm whether this install is still needed.
! pip install lightgbm

"""Setup Kaggle json credentials"""

# Colab upload prompt; the copy step below expects a file named kaggle.json
# to end up in the working directory.
from google.colab import files
files.upload()

# Put the credentials file into ~/.kaggle/.
!mkdir ~/.kaggle/
!cp kaggle.json ~/.kaggle/

# Restrict the credentials file to owner read/write.
!chmod 600 ~/.kaggle/kaggle.json

# Presumably a smoke test that the credentials are accepted.
!kaggle datasets list

# NOTE(review): the dataset argument has a CSV file name appended after the
# owner/dataset slug. The Kaggle CLI documents `-f` for selecting a single
# file - confirm what this actually downloads and whether the result needs
# unzipping before the CSV is read later on.
!kaggle datasets download -d hamzafarooq50/hotel-listings-and-reviews/HotelListInBarcelona__en2019100120191005.csv

!ls

# English spaCy pipeline that spacy.load("en_core_web_sm") needs below.
!python -m spacy download en_core_web_sm

# Same pattern for the reviews CSV; --force re-downloads even if a copy exists.
!kaggle datasets download --force -d hamzafarooq50/hotel-listings-and-reviews/hotelReviewsInBarcelona__en2019100120191005.csv

!ls

# Load the small English spaCy pipeline downloaded above.
nlp = spacy.load("en_core_web_sm")

import re

# Fetch the NLTK resources: tokenizer data, stop-word lists, WordNet and the
# Open Multilingual WordNet data.
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

nltk.download('stopwords')

# NOTE(review): the name `stopwords` is rebound to a plain list further down
# (stopwords = list(STOP_WORDS)), which hides this NLTK module.
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('omw-1.4')
# NOTE(review): the import below appears twice, and neither word_tokenize nor
# WordNetLemmatizer is referenced in the visible remainder of the file.
from nltk.stem import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

import os
import spacy
from spacy import displacy

# Named-entity rendering demo. `nlp` is the pipeline created by the earlier
# spacy.load("en_core_web_sm") call; loading the same pipeline a second time
# here would only cost time and memory.
text = """Example text"""
#text = "I really hope that France does not win the World Cup and Morocco makes it to the finals"
doc = nlp(text)
sentence_spans = list(doc.sents)
# jupyter=True explicitly enables displaCy's Jupyter rendering mode.
displacy.render(doc, jupyter=True, style="ent")

# NOTE(review): this rebinds `stopwords`, imported from nltk.corpus earlier in
# the file, to a plain list of spaCy stop words.
stopwords = list(STOP_WORDS)
from string import punctuation
# Extend the punctuation set with the newline character.
punctuation = punctuation + '\n'

import pandas as pd

import scipy.spatial
import pickle as pkl

!pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('all-MiniLM-L6-v2')
#embedder = SentenceTransformer('bert-base-nli-mean-tokens')

!pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('all-MiniLM-L6-v2')
embedder = SentenceTransformer('bert-base-nli-mean-tokens')


df = df = pd.read_csv('HotelListInBarcelona__en2019100120191005.csv',encoding = "ISO-8859-1")


!kaggle datasets download --force -d hamzafarooq50/hotel-listings-and-reviews

df.head()

df['hotel_name'].value_counts()

df['hotel_name'].drop_duplicates()

df_combined = df.sort_values(['hotel_name']).groupby('hotel_name', sort=False).hotel_features.apply(''.join).reset_index(name='hotel_features')

df_combined.head().T

import re

# Remove every character that is not an ASCII letter, digit or whitespace.
# The upper-case range must be A-Z: a range written A-z also covers the ASCII
# characters that sit between 'Z' and 'a' (brackets, backslash, caret,
# underscore, backtick) and would leave them in the text. The pattern is a raw
# string so that \s reaches the regex engine instead of being parsed as a
# (deprecated) invalid string escape.
df_combined['hotel_features'] = df_combined['hotel_features'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x)
)

def lower_case(input_str):
    """Return ``input_str`` converted to lower case via ``str.lower``."""
    return input_str.lower()

# Lower-case every combined feature string; the function is passed directly
# rather than through a wrapping lambda.
df_combined['hotel_features'] = df_combined['hotel_features'].apply(lower_case)

# From here on `df` is the per-hotel frame (the same object as df_combined),
# not the raw listing table read from the CSV.
df = df_combined

# Map each feature text to its hotel name, with the feature text as the key.
# Because the text is the dictionary key, hotels whose feature text is
# identical collapse into a single entry.
df_sentences = df_combined.set_index("hotel_features")["hotel_name"].to_dict()

# Notebook-style peeks: number of distinct texts and the first five of them.
len(df_sentences)

list(df_sentences)[:5]

# Materialise the keys as plain strings, wrapped in tqdm for a progress bar.
df_sentences_list = [str(d) for d in tqdm(list(df_sentences))]

# Corpus with example sentences
# The corpus is the list of per-hotel feature strings built above.
corpus = df_sentences_list
# One embedding per corpus entry, computed with `embedder`.
corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)

# Notebook-style peek at the first embedding.
corpus_embeddings[0]

# NOTE(review): `queries` is reassigned shortly after this, and
# `query_embeddings` (plural) is not read anywhere in the visible remainder of
# the file - this looks like a leftover experiment.
queries = ['Hotel near tourist locations and with free WIFI',
           ]
query_embeddings = embedder.encode(queries,show_progress_bar=True)

import torch

# Query sentences:
queries = ['Hotel at least 10 minutes away from sagrada familia'
           ]

# Find the closest 3 sentences of the corpus for each query sentence based on
# cosine similarity (fewer when the corpus itself has fewer than 3 entries).
top_k = min(3, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # `corpus_embeddings` was encoded without convert_to_tensor, so it is not
    # guaranteed to live on the same device as the query tensor. Move it there
    # explicitly; otherwise the similarity matmul can fail on a GPU runtime.
    corpus_tensor = torch.as_tensor(corpus_embeddings).to(query_embedding.device)

    # Cosine similarity against every corpus entry, then the top_k best.
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_tensor)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # The header follows top_k so it stays truthful for small corpora.
    print(f"\nTop {top_k} most similar sentences in corpus:")

    # Plain Python floats/ints keep the indexing and formatting simple.
    for score, idx in zip(top_results.values.tolist(),
                          top_results.indices.tolist()):
        print("(Score: {:.4f})".format(score))
        print(corpus[idx], "(Score: {:.4f})".format(score))
        # Look the matched text back up to recover the hotel it belongs to.
        row_dict = df.loc[df['hotel_features'] == corpus[idx]]
        print("hotel_name:  ", row_dict['hotel_name'], "\n")

# Second encoder, used for the next search pass in place of `embedder`.
model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1')
# Re-embed the same corpus with this model; these embeddings are separate
# from `corpus_embeddings` produced by `embedder`.
embeddings = model.encode(corpus)
#print(embeddings)

# Notebook-style peek. At this point `query_embedding` is still the value
# left behind by the preceding search loop (computed with `embedder`).
query_embedding.shape

# Query sentences:
queries = ['Hotel at least 10 minutes away from good food',
           'quiet'
           ]


# Find the closest 5 sentences of the corpus for each query sentence based on
# cosine similarity (fewer when the corpus itself has fewer than 5 entries).
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)

    # `embeddings` was encoded without convert_to_tensor, so it is not
    # guaranteed to live on the same device as the query tensor. Move it there
    # explicitly; otherwise the similarity matmul can fail on a GPU runtime.
    embeddings_tensor = torch.as_tensor(embeddings).to(query_embedding.device)

    # Cosine similarity against every corpus entry, then the top_k best.
    cos_scores = util.pytorch_cos_sim(query_embedding, embeddings_tensor)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # The header follows top_k so it stays truthful for small corpora.
    print(f"\nTop {top_k} most similar sentences in corpus:")

    # Plain Python floats/ints keep the indexing and formatting simple.
    for score, idx in zip(top_results.values.tolist(),
                          top_results.indices.tolist()):
        print("(Score: {:.4f})".format(score))
        print(corpus[idx], "(Score: {:.4f})".format(score))
        # Look the matched text back up to recover the hotel it belongs to.
        row_dict = df.loc[df['hotel_features'] == corpus[idx]]
        print("hotel_name:  ", row_dict['hotel_name'], "\n")

df

# Same lookup done with the library's semantic_search helper.
# NOTE: `query_embedding` is whatever the preceding loop left behind, i.e. the
# embedding of the *last* entry in `queries`; only that query is searched.
hits = util.semantic_search(query_embedding, embeddings, top_k=5)
hits = hits[0]      # Get the hits for the first (and only) query
for hit in hits:
    print(hit)
    print("(Score: {:.4f})".format(hit['score']))
    matched_text = corpus[hit['corpus_id']]
    print(matched_text)
    # Look the matched text back up to recover the hotel it belongs to.
    row_dict = df.loc[df['hotel_features'] == matched_text]
    print("hotel_name:  ", row_dict['hotel_name'], "\n")

# Write the currently installed package versions to requirements.txt.
!pip freeze > requirements.txt