Spaces:

thiyagab
/

Thamizh

Build error

App Files Files Community

thiyagab committed on Jan 9, 2023

Commit

83c8bb0

•

1 Parent(s): 1dfc271

Updated to check for the index search

Browse files

Files changed (1) hide show

semanticsearch.py +34 -26

semanticsearch.py CHANGED Viewed

@@ -1,14 +1,7 @@
-#Write some lines to encode (sentences 0 and 2 are both ideltical):
-sen = [
-    "Three years later, the coffin was still full of Jello.",
-    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
-    "The person box was packed with jelly many dozens of months later.",
-    "He found a leprechaun in his walnut shell."
-]
 import json
 import numpy
 import os
 # Opening JSON file
 f = open('thirukural_git.json')
@@ -31,32 +24,47 @@ for kural in data['kurals']:
 f.close()
 from sentence_transformers import SentenceTransformer
 model = SentenceTransformer('all-MiniLM-L6-v2')
-# model.tokenizer.add_special_tokens({'pad_token':'[thiyaga]'})
-#Encoding:
 sen_embeddings = model.encode(en_translations)
 # sen_embeddings= numpy.memmap('trainedmodel',mode="r",dtype=numpy.float32,shape=(1330,768))
 # sen_embeddings.tofile('trainedmodel')
 def find_similarities(input:str):
-    input_embeddings = model.encode([input.lower()])
-    from sklearn.metrics.pairwise import cosine_similarity
-    #let's calculate cosine similarity for sentence 0:
-    similarity_matrix=cosine_similarity(
-        [input_embeddings[0]],
-        sen_embeddings[1:]
-    )
-    indices=[numpy.argpartition(similarity_matrix[0],-3)[-3:]]
     response=''
-    for index in indices[0]:
-        print(similarity_matrix[0][index])
-        print(en_translations[index+1])
-        response += "\n".join(kurals[index+1])+"\n"
-        response += en_translations[index + 1]+"\n\n"
-        print("\n".join(kurals[index+1]))
-    return response
 # while True:
 #     text=input('Ask valluvar: ')

 import json
 import numpy
 import os
+import re
 # Opening JSON file
 f = open('thirukural_git.json')
 f.close()
 from sentence_transformers import SentenceTransformer
 model = SentenceTransformer('all-MiniLM-L6-v2')
 sen_embeddings = model.encode(en_translations)
 # sen_embeddings= numpy.memmap('trainedmodel',mode="r",dtype=numpy.float32,shape=(1330,768))
 # sen_embeddings.tofile('trainedmodel')
+def preprocess(input:str):
+    if input.startswith('/'):
+        #TODO
+        return True
+    values = [int(s) for s in re.findall(r'-?\d+\.?\d*', input)]
+    if values:
+        index=values[0]
+        return kural_definition(index)
+    else:
+       return False
 def find_similarities(input:str):
+    if(not preprocess(input)):
+        input_embeddings = model.encode([input.lower()])
+        from sklearn.metrics.pairwise import cosine_similarity
+        #let's calculate cosine similarity for sentence 0:
+        similarity_matrix=cosine_similarity(
+            [input_embeddings[0]],
+            sen_embeddings[1:]
+        )
+        indices=[numpy.argpartition(similarity_matrix[0],-3)[-3:]]
+        indices.sort(reverse=True)
+        response=''
+        for index in indices[0]:
+           response+=kural_definition(index)
+        return response
+def kural_definition(index:int):
     response=''
+    print(en_translations[index + 1])
+    response += "\n".join(kurals[index + 1]) + "\n"
+    response += en_translations[index + 1] + "\n\n"
+    print("\n".join(kurals[index + 1]))
 # while True:
 #     text=input('Ask valluvar: ')