eljanmahammadli committed on
Commit
caa635d
1 Parent(s): c38b78d

changed similarity to sentence transformers

Browse files
__pycache__/utils.cpython-311.pyc ADDED
Binary file (13.7 kB). View file
 
requirements.txt CHANGED
@@ -20,4 +20,5 @@ spacy
20
  textstat
21
  plotly
22
  tqdm
23
- pymupdf
 
 
20
  textstat
21
  plotly
22
  tqdm
23
+ pymupdf
24
+ sentence-transformers
utils.py CHANGED
@@ -9,10 +9,12 @@ from collections import Counter
9
  import numpy as np
10
  import asyncio
11
  import nltk
 
12
 
13
  nltk.download('punkt')
14
 
15
  WORD = re.compile(r"\w+")
 
16
 
17
 
18
  # returns cosine similarity of two vectors
@@ -53,6 +55,13 @@ def cosineSim(text1, text2):
53
  cosine = get_cosine(vector1, vector2)
54
  return cosine
55
 
 
 
 
 
 
 
 
56
  def get_soup_requests(url):
57
  page = requests.get(url)
58
  if page.status_code == 200:
@@ -130,7 +139,7 @@ def googleSearch(
130
  urlList.append(url)
131
  scoreArray.append([0] * len(sentences))
132
  urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
133
- scoreArray[urlList.index(url)][i] = cosineSim(
134
  sentence, snippet
135
  )
136
  else:
 
9
  import numpy as np
10
  import asyncio
11
  import nltk
12
+ from sentence_transformers import SentenceTransformer, util
13
 
14
  nltk.download('punkt')
15
 
16
  WORD = re.compile(r"\w+")
17
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
18
 
19
 
20
  # returns cosine similarity of two vectors
 
55
  cosine = get_cosine(vector1, vector2)
56
  return cosine
57
 
58
+ def sentence_similarity(text1, text2):
59
+ embedding_1= model.encode(text1, convert_to_tensor=True)
60
+ embedding_2 = model.encode(text2, convert_to_tensor=True)
61
+
62
+ o = util.pytorch_cos_sim(embedding_1, embedding_2)
63
+ return round(o.item(), 2)
64
+
65
  def get_soup_requests(url):
66
  page = requests.get(url)
67
  if page.status_code == 200:
 
139
  urlList.append(url)
140
  scoreArray.append([0] * len(sentences))
141
  urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
142
+ scoreArray[urlList.index(url)][i] = sentence_similarity(
143
  sentence, snippet
144
  )
145
  else: