thiyagab
committed on
Commit
•
c6cc78a
1
Parent(s):
3b9fa76
semantic search added
Browse files- app.py +8 -2
- requirements.txt +4 -0
- semanticsearch.py +65 -0
app.py
CHANGED
@@ -1,4 +1,10 @@
|
|
1 |
import streamlit as st
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
+
import semanticsearch
|
4 |
+
|
5 |
+
# x = st.slider('Select a value')
|
6 |
+
x=st.text_input('Ask valluvar')
|
7 |
+
# st.write(x, 'squared is', x * x)
|
8 |
+
response=semanticsearch.find_similarities(x)
|
9 |
+
st.text(response)
|
10 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
transformers
|
3 |
+
sentence-transformers
|
4 |
+
sentence_embeddings
|
semanticsearch.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#Write some lines to encode (sentences 0 and 2 are both ideltical):
|
2 |
+
sen = [
|
3 |
+
"Three years later, the coffin was still full of Jello.",
|
4 |
+
"The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
|
5 |
+
"The person box was packed with jelly many dozens of months later.",
|
6 |
+
"He found a leprechaun in his walnut shell."
|
7 |
+
]
|
8 |
+
|
9 |
+
import json
|
10 |
+
import numpy
|
11 |
+
import os
|
12 |
+
|
13 |
+
# Opening JSON file
|
14 |
+
f = open('thirukural_git.json')
|
15 |
+
|
16 |
+
# returns JSON object as
|
17 |
+
# a dictionary
|
18 |
+
data = json.load(f)
|
19 |
+
|
20 |
+
en_translations=[]
|
21 |
+
kurals=[]
|
22 |
+
# Iterating through the json
|
23 |
+
# list
|
24 |
+
for kural in data['kurals']:
|
25 |
+
en_translations.append((kural['meaning']['en'].lower()))
|
26 |
+
kurals.append(kural['kural'])
|
27 |
+
|
28 |
+
|
29 |
+
|
30 |
+
# Closing file
|
31 |
+
f.close()
|
32 |
+
from sentence_transformers import SentenceTransformer
|
33 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
34 |
+
# model.tokenizer.add_special_tokens({'pad_token':'[thiyaga]'})
|
35 |
+
#Encoding:
|
36 |
+
|
37 |
+
sen_embeddings = model.encode(en_translations)
|
38 |
+
|
39 |
+
# sen_embeddings= numpy.memmap('trainedmodel',mode="r",dtype=numpy.float32,shape=(1330,768))
|
40 |
+
# sen_embeddings.tofile('trainedmodel')
|
41 |
+
|
42 |
+
def find_similarities(input:str):
|
43 |
+
input_embeddings = model.encode([input.lower()])
|
44 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
45 |
+
#let's calculate cosine similarity for sentence 0:
|
46 |
+
similarity_matrix=cosine_similarity(
|
47 |
+
[input_embeddings[0]],
|
48 |
+
sen_embeddings[1:]
|
49 |
+
)
|
50 |
+
|
51 |
+
indices=[numpy.argpartition(similarity_matrix[0],-3)[-3:]]
|
52 |
+
response=''
|
53 |
+
for index in indices[0]:
|
54 |
+
print(similarity_matrix[0][index])
|
55 |
+
response+=en_translations[index+1]
|
56 |
+
print(en_translations[index+1])
|
57 |
+
response += "\n".join(kurals[index+1])
|
58 |
+
print("\n".join(kurals[index+1]))
|
59 |
+
return response
|
60 |
+
|
61 |
+
# while True:
|
62 |
+
# text=input('Ask valluvar: ')
|
63 |
+
# if( text == 'exit'):
|
64 |
+
# break
|
65 |
+
# find_similarities(text)
|