# -*- coding: utf-8 -*-
"""[Uma Namboothiripad]Assignment_2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1_sofOjXRDnId49NOup4sdiVS1E_51T-b
Load the dataset below
"""
!pip install -U spacy
# First, install the library that provides an easy-to-use interface to BERT-style sentence embeddings
# https://github.com/UKPLab/sentence-transformers/tree/master/sentence_transformers
!pip install -U sentence-transformers
"""I was having issues connecting my csv file to the colab notebook, so I ended up connecting this to my drive"""
import spacy
from datasets import load_dataset
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
print(pd.__version__)
! pip install -q kaggle
! pip install lightgbm
"""Setup Kaggle json credentials"""
from google.colab import files
files.upload()
!mkdir ~/.kaggle/
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets list
!kaggle datasets download -d hamzafarooq50/hotel-listings-and-reviews -f HotelListInBarcelona__en2019100120191005.csv
!ls
!python -m spacy download en_core_web_sm
!kaggle datasets download --force -d hamzafarooq50/hotel-listings-and-reviews -f hotelReviewsInBarcelona__en2019100120191005.csv
!ls
nlp = spacy.load("en_core_web_sm")
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
import os
from spacy import displacy
text = """Example text"""
#text = "I really hope that France does not win the World Cup and Morocco makes it to the finals"
doc = nlp(text)
sentence_spans = list(doc.sents)
displacy.render(doc, jupyter = True, style="ent")
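"""`sentence_spans` is computed above but never rendered. As a sketch, displacy can also visualize the dependency parse of those sentence spans (passing a list of spans with style="dep" is standard spaCy; rendering it here is my addition, not part of the original notebook):"""
# Hedged sketch: render the dependency parse of each sentence span
displacy.render(sentence_spans, style="dep", jupyter=True)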
stopwords = list(STOP_WORDS)  # note: this shadows the nltk.corpus stopwords imported above
punctuation = punctuation + '\n'  # treat newlines as punctuation as well
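"""The stop-word list, punctuation set, Counter, and nlargest imported above are the usual ingredients of a frequency-based extractive summarizer. Here is a minimal sketch of that pattern; the helper name `summarize` and the 30% sentence ratio are my own choices, not part of the original assignment:"""
def summarize(text, ratio=0.3):
    # Hypothetical helper: frequency-based extractive summary (a sketch, not the assignment's method)
    doc = nlp(text)
    # Score each word by frequency, skipping stop words and punctuation
    word_freq = Counter(
        token.text.lower() for token in doc
        if token.text.lower() not in stopwords and token.text not in punctuation
    )
    max_freq = max(word_freq.values(), default=1)
    # Score each sentence as the sum of its normalized word frequencies
    sent_scores = {
        sent: sum(word_freq[token.text.lower()] / max_freq
                  for token in sent if token.text.lower() in word_freq)
        for sent in doc.sents
    }
    # Keep the top-scoring sentences as the summary
    n = max(1, int(len(sent_scores) * ratio))
    best = nlargest(n, sent_scores, key=sent_scores.get)
    return ' '.join(sent.text for sent in best)

print(summarize(text))  # e.g. on the example text defined above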
import pandas as pd
import scipy.spatial
import pickle as pkl
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
#embedder = SentenceTransformer('all-MiniLM-L6-v2')
embedder = SentenceTransformer('bert-base-nli-mean-tokens')
# Download (and unzip) the full dataset before reading it
!kaggle datasets download --force --unzip -d hamzafarooq50/hotel-listings-and-reviews
df = pd.read_csv('HotelListInBarcelona__en2019100120191005.csv', encoding="ISO-8859-1")
df.head()
df['hotel_name'].value_counts()
df['hotel_name'].drop_duplicates()
# Join each hotel's feature strings with a space so adjacent reviews don't run together
df_combined = df.sort_values(['hotel_name']).groupby('hotel_name', sort=False).hotel_features.apply(' '.join).reset_index(name='hotel_features')
df_combined.head().T
df_combined['hotel_features'] = df_combined['hotel_features'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
def lower_case(input_str):
    input_str = input_str.lower()
    return input_str

df_combined['hotel_features'] = df_combined['hotel_features'].apply(lambda x: lower_case(x))
df = df_combined
df_sentences = df_combined.set_index("hotel_features")
df_sentences = df_sentences["hotel_name"].to_dict()
df_sentences_list = list(df_sentences.keys())
len(df_sentences_list)
list(df_sentences.keys())[:5]
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
# Corpus: one concatenated feature string per hotel
corpus = df_sentences_list
corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
corpus_embeddings[0]
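"""A quick sanity check on the corpus embeddings: encode() returns one vector per document, so the array shape should be (len(corpus), embedding_dim), i.e. 768 for bert-base-nli-mean-tokens (it would be 384 for all-MiniLM-L6-v2):"""
print(corpus_embeddings.shape)  # expected: (len(corpus), 768)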
queries = ['Hotel near tourist locations and with free WIFI',
]
query_embeddings = embedder.encode(queries,show_progress_bar=True)
import torch
# Query sentences:
queries = ['Hotel at least 10 minutes away from sagrada familia'
]
# Find the closest 3 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(3, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # We use cosine similarity and torch.topk to find the top_k highest scores
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 3 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))
        row_dict = df.loc[df['hotel_features'] == corpus[idx]]
        print("hotel_name:", row_dict['hotel_name'], "\n")
model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1')
embeddings = model.encode(corpus)
#print(embeddings)
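"""paraphrase-xlm-r-multilingual-v1 maps sentences from many languages into a shared vector space, so a non-English query can be matched against the English corpus. A hedged sketch (the Spanish query string is my own example):"""
spanish_query = model.encode('Hotel cerca de la Sagrada Familia', convert_to_tensor=True)
spanish_scores = util.pytorch_cos_sim(spanish_query, embeddings)[0]
print(corpus[int(torch.argmax(spanish_scores))][:100])  # best-matching hotel features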
query_embedding.shape
# Query sentences:
queries = ['Hotel at least 10 minutes away from good food',
           'quiet'
           ]

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)

    # We use cosine similarity and torch.topk to find the top_k highest scores
    cos_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))
        row_dict = df.loc[df['hotel_features'] == corpus[idx]]
        print("hotel_name:", row_dict['hotel_name'], "\n")
df
# util.semantic_search is a convenience wrapper for the same cosine-similarity top-k search;
# note it reuses query_embedding from the last loop iteration above (the 'quiet' query)
hits = util.semantic_search(query_embedding, embeddings, top_k=5)
hits = hits[0]  # Get the hits for the first query
for hit in hits:
    print(hit)
    print("(Score: {:.4f})".format(hit['score']))
    print(corpus[hit['corpus_id']])
    row_dict = df.loc[df['hotel_features'] == corpus[hit['corpus_id']]]
    print("hotel_name:", row_dict['hotel_name'], "\n")
!pip freeze > requirements.txt