thiyagab committed on
Commit
c6cc78a
1 Parent(s): 3b9fa76

semantic search added

Files changed (3)
  1. app.py +8 -2
  2. requirements.txt +4 -0
  3. semanticsearch.py +65 -0
app.py CHANGED
@@ -1,4 +1,10 @@
 import streamlit as st
 
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)
+import semanticsearch
+
+# x = st.slider('Select a value')
+x = st.text_input('Ask valluvar')
+# st.write(x, 'squared is', x * x)
+response = semanticsearch.find_similarities(x)
+st.text(response)
+
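Note: as written, app.py calls find_similarities even before any text has been entered, so the search runs on an empty string on first load. A minimal sketch of one way to guard against that (not part of this commit), using the same Streamlit widgets:

import streamlit as st
import semanticsearch

x = st.text_input('Ask valluvar')
if x:  # run the search only once the user has typed a query
    st.text(semanticsearch.find_similarities(x))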
requirements.txt ADDED
@@ -0,0 +1,4 @@
+streamlit
+transformers
+sentence-transformers
+scikit-learn
semanticsearch.py ADDED
@@ -0,0 +1,65 @@
+# Sample sentences to encode (sentences 0 and 2 are similar in meaning); unused below:
+sen = [
+    "Three years later, the coffin was still full of Jello.",
+    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
+    "The person box was packed with jelly many dozens of months later.",
+    "He found a leprechaun in his walnut shell."
+]
+
+import json
+import numpy
+
+# Open the JSON file of kurals
+f = open('thirukural_git.json')
+
+# Load the JSON object as a dictionary
+data = json.load(f)
+
+en_translations = []
+kurals = []
+# Collect the English meaning and the kural couplet for every entry
+for kural in data['kurals']:
+    en_translations.append(kural['meaning']['en'].lower())
+    kurals.append(kural['kural'])
+
+# Close the file
+f.close()
+
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer('all-MiniLM-L6-v2')
+# model.tokenizer.add_special_tokens({'pad_token': '[thiyaga]'})
+
+# Encode every English translation once at import time
+sen_embeddings = model.encode(en_translations)
+
+# sen_embeddings = numpy.memmap('trainedmodel', mode="r", dtype=numpy.float32, shape=(1330, 768))
+# sen_embeddings.tofile('trainedmodel')
+
+
+def find_similarities(query: str):
+    from sklearn.metrics.pairwise import cosine_similarity
+
+    # Encode the query and compare it against every kural embedding
+    input_embeddings = model.encode([query.lower()])
+    similarity_matrix = cosine_similarity(
+        [input_embeddings[0]],
+        sen_embeddings
+    )
+
+    # Pick the indices of the three most similar translations
+    indices = numpy.argpartition(similarity_matrix[0], -3)[-3:]
+    response = ''
+    for index in indices:
+        print(similarity_matrix[0][index])
+        response += en_translations[index] + "\n"
+        print(en_translations[index])
+        response += "\n".join(kurals[index]) + "\n\n"
+        print("\n".join(kurals[index]))
+    return response
+
+
+# while True:
+#     text = input('Ask valluvar: ')
+#     if text == 'exit':
+#         break
+#     find_similarities(text)
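The commented-out memmap/tofile lines in semanticsearch.py suggest caching the encoded embeddings instead of re-encoding every kural on startup. A minimal sketch of that idea (an assumption, not part of this commit), using numpy.save/numpy.load with a hypothetical cache file so the dtype and shape are stored alongside the data:

import os
import numpy

EMBEDDINGS_CACHE = 'sen_embeddings.npy'  # hypothetical cache path, not used by the committed code

def load_or_encode(model, texts):
    # Reuse cached embeddings when the file exists; otherwise encode once and save
    if os.path.exists(EMBEDDINGS_CACHE):
        return numpy.load(EMBEDDINGS_CACHE)
    embeddings = model.encode(texts)
    numpy.save(EMBEDDINGS_CACHE, embeddings)
    return embeddings

# In semanticsearch.py this would replace the direct call:
# sen_embeddings = load_or_encode(model, en_translations)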