thiyagab committed on
Commit
83c8bb0
1 Parent(s): 1dfc271

Updated to check for the index search

Browse files
Files changed (1) hide show
  1. semanticsearch.py +34 -26
semanticsearch.py CHANGED
@@ -1,14 +1,7 @@
1
- #Write some lines to encode (sentences 0 and 2 are both identical):
2
- sen = [
3
- "Three years later, the coffin was still full of Jello.",
4
- "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
5
- "The person box was packed with jelly many dozens of months later.",
6
- "He found a leprechaun in his walnut shell."
7
- ]
8
-
9
  import json
10
  import numpy
11
  import os
 
12
 
13
  # Opening JSON file
14
  f = open('thirukural_git.json')
@@ -31,32 +24,47 @@ for kural in data['kurals']:
31
  f.close()
32
  from sentence_transformers import SentenceTransformer
33
  model = SentenceTransformer('all-MiniLM-L6-v2')
34
- # model.tokenizer.add_special_tokens({'pad_token':'[thiyaga]'})
35
- #Encoding:
36
-
37
  sen_embeddings = model.encode(en_translations)
38
 
39
  # sen_embeddings= numpy.memmap('trainedmodel',mode="r",dtype=numpy.float32,shape=(1330,768))
40
  # sen_embeddings.tofile('trainedmodel')
41
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def find_similarities(input:str):
43
- input_embeddings = model.encode([input.lower()])
44
- from sklearn.metrics.pairwise import cosine_similarity
45
- #let's calculate cosine similarity for sentence 0:
46
- similarity_matrix=cosine_similarity(
47
- [input_embeddings[0]],
48
- sen_embeddings[1:]
49
- )
50
 
51
- indices=[numpy.argpartition(similarity_matrix[0],-3)[-3:]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  response=''
53
- for index in indices[0]:
54
- print(similarity_matrix[0][index])
55
- print(en_translations[index+1])
56
- response += "\n".join(kurals[index+1])+"\n"
57
- response += en_translations[index + 1]+"\n\n"
58
- print("\n".join(kurals[index+1]))
59
- return response
60
 
61
  # while True:
62
  # text=input('Ask valluvar: ')
 
 
 
 
 
 
 
 
 
1
  import json
2
  import numpy
3
  import os
4
+ import re
5
 
6
  # Opening JSON file
7
  f = open('thirukural_git.json')
 
24
  f.close()
25
  from sentence_transformers import SentenceTransformer
26
  model = SentenceTransformer('all-MiniLM-L6-v2')
 
 
 
27
  sen_embeddings = model.encode(en_translations)
28
 
29
  # sen_embeddings= numpy.memmap('trainedmodel',mode="r",dtype=numpy.float32,shape=(1330,768))
30
  # sen_embeddings.tofile('trainedmodel')
31
 
32
+
33
+ def preprocess(input:str):
34
+ if input.startswith('/'):
35
+ #TODO
36
+ return True
37
+ values = [int(s) for s in re.findall(r'-?\d+\.?\d*', input)]
38
+
39
+ if values:
40
+ index=values[0]
41
+ return kural_definition(index)
42
+ else:
43
+ return False
44
  def find_similarities(input:str):
 
 
 
 
 
 
 
45
 
46
+ if(not preprocess(input)):
47
+ input_embeddings = model.encode([input.lower()])
48
+ from sklearn.metrics.pairwise import cosine_similarity
49
+ #let's calculate cosine similarity for sentence 0:
50
+ similarity_matrix=cosine_similarity(
51
+ [input_embeddings[0]],
52
+ sen_embeddings[1:]
53
+ )
54
+
55
+ indices=[numpy.argpartition(similarity_matrix[0],-3)[-3:]]
56
+ indices.sort(reverse=True)
57
+ response=''
58
+ for index in indices[0]:
59
+ response+=kural_definition(index)
60
+ return response
61
+
62
+ def kural_definition(index:int):
63
  response=''
64
+ print(en_translations[index + 1])
65
+ response += "\n".join(kurals[index + 1]) + "\n"
66
+ response += en_translations[index + 1] + "\n\n"
67
+ print("\n".join(kurals[index + 1]))
 
 
 
68
 
69
  # while True:
70
  # text=input('Ask valluvar: ')