|
import json |
|
import numpy |
|
import os |
|
import re |
|
|
|
|
|
# Load the kural data once at import time.
# Decode explicitly as UTF-8 (the JSON default) instead of relying on the
# platform locale, and let the context manager close the file even when
# parsing fails.
with open('thirukural_git.json', encoding='utf-8') as f:
    data = json.load(f)

# Parallel lists: position i in both refers to the same entry of
# data['kurals']. Meanings are lower-cased here so they match the lower-cased
# query text encoded in find_similarities.
en_translations = [kural['meaning']['en'].lower() for kural in data['kurals']]
kurals = [kural['kural'] for kural in data['kurals']]

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every English meaning up front so a query only needs to encode its
# own text before being compared against these vectors.
sen_embeddings = model.encode(en_translations)
|
|
|
|
|
|
|
|
|
|
|
def preprocess(input: str):
    """Handle inputs that should not go through the similarity search.

    Args:
        input: Raw text to inspect.

    Returns:
        ``True`` when the text starts with ``/``; otherwise the result of
        ``kural_definition`` for the first integer found in the text; or
        ``False`` when the text contains no integer.
    """
    if input.startswith('/'):
        return True

    # Match whole integers only. The previous pattern also captured a
    # trailing "." or a decimal part ("3." / "3.5"), which int() rejects with
    # ValueError - so a number at the end of a sentence crashed the lookup.
    number = re.search(r'-?\d+', input)
    if number is None:
        return False

    return kural_definition(int(number.group()))
|
def find_similarities(input: str, *, top_k: int = 3):
    """Return the kurals whose English meaning is most similar to ``input``.

    The text is first offered to ``preprocess``; when that returns a truthy
    value the similarity search is skipped and ``None`` is returned.

    Args:
        input: Free-text query. It is lower-cased before being embedded.
        top_k: Maximum number of kurals to include in the reply.

    Returns:
        The concatenated ``kural_definition`` texts of the best matches,
        ordered from most to least similar, or ``None`` when ``preprocess``
        handled the input.
    """
    if preprocess(input):
        return None

    from sklearn.metrics.pairwise import cosine_similarity

    query_embedding = model.encode([input.lower()])

    # The first embedding is skipped, so position i in `scores` refers to
    # entry i + 1 of the data; kural_definition applies the matching + 1.
    # NOTE(review): this makes the first entry unreachable by search -
    # confirm that skipping it is intentional.
    scores = cosine_similarity([query_embedding[0]], sen_embeddings[1:])[0]

    # Full descending sort: the previous argpartition result was wrapped in a
    # list, so its sort() was a no-op and the matches came back in arbitrary
    # order; argpartition also raised when fewer than three scores existed.
    ranked = numpy.argsort(scores)[::-1]
    best = ranked[:max(top_k, 0)]

    return ''.join(kural_definition(int(position)) for position in best)
|
|
|
def kural_definition(index: int):
    """Build the reply text for one kural and its English meaning.

    The entry is read from position ``index + 1`` of the module-level
    ``kurals`` and ``en_translations`` lists.
    NOTE(review): the + 1 mirrors the ``sen_embeddings[1:]`` slice used by
    ``find_similarities``; confirm it is also the intended mapping for the
    numbers that ``preprocess`` passes in.

    Args:
        index: Position of the entry, shifted by one before the lookup.

    Returns:
        The kural (presumably a list of lines, joined with newlines), then
        its English meaning, followed by a blank line.

    Raises:
        IndexError: If ``index + 1`` is outside the loaded data.
    """
    position = index + 1
    meaning = en_translations[position]
    kural_text = "\n".join(kurals[position])

    # Keep echoing the lookup to stdout, as the original did.
    print(meaning)
    print(kural_text)

    # The original built this text but never returned it, so callers that
    # concatenate the result received None.
    return kural_text + "\n" + meaning + "\n\n"
|
|
|
|
|
|
|
|
|
|
|
|