|
import json |
|
import numpy |
|
import os |
|
import re |
|
|
|
|
|
# Load the Thirukkural dataset. The file is opened as UTF-8 explicitly
# (JSON text is UTF-8 and the verses are Tamil), and the `with` block closes
# the handle even if parsing fails.
with open('thirukural_git.json', encoding='utf-8') as f:
    data = json.load(f)

# Parallel lists: position i in both refers to the same entry of
# data['kurals'].
# en_translations[i] is the lower-cased English meaning of that entry.
en_translations = [kural['meaning']['en'].lower() for kural in data['kurals']]
# kurals[i] is the entry's 'kural' value; it is later joined with "\n", so it
# is presumably a list of verse lines -- TODO confirm against the JSON file.
kurals = [kural['kural'] for kural in data['kurals']]
|
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by every query; it is loaded once when the
# module is imported.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embeddings of all English meanings, computed once up front.
# sen_embeddings[i] is the embedding of en_translations[i].
sen_embeddings = model.encode(en_translations)
|
|
|
|
|
|
|
|
|
|
|
|
|
def preprocess(input: str):
    """Resolve a query that refers to a kural by its number.

    Args:
        input: Raw query text.

    Returns:
        The formatted text from ``kural_definition`` when the first integer
        found in ``input`` is a valid 1-based kural number; ``False``
        otherwise (input starting with '/', no number present, or a number
        outside ``1..len(kurals)``).
    """
    # Inputs beginning with '/' are never treated as kural-number lookups.
    if input.startswith('/'):
        return False

    # Only the first integer in the text matters. Matching plain digits
    # (rather than decimal-looking tokens such as "3.5" or "3.") guarantees
    # that int() cannot raise ValueError on the matched text.
    match = re.search(r'-?\d+', input)
    if match is None:
        return False

    number = int(match.group())
    # Kural numbers are 1-based; reject zero, negatives and numbers past the
    # end of the dataset instead of letting the lookup raise IndexError.
    if 1 <= number <= len(kurals):
        return kural_definition(number - 1)
    return False
|
|
|
|
|
def find_similarities(input: str):
    """Answer a query with kural text.

    If ``preprocess`` returns a truthy result for the query, that result is
    returned as-is. Otherwise the query is embedded and the three stored
    meanings with the highest cosine similarity are returned, best match
    first, each formatted by ``kural_definition`` and concatenated.

    Side effect: prints each selected similarity score to stdout.
    """
    direct_answer = preprocess(input)
    if direct_answer:
        return direct_answer

    query_embedding = model.encode([input.lower()])[0]

    from sklearn.metrics.pairwise import cosine_similarity

    # The comparison set is sen_embeddings[1:], so the first stored entry is
    # never a candidate and score position p corresponds to entry p + 1.
    # NOTE(review): unclear whether skipping entry 0 is intentional -- confirm.
    scores = cosine_similarity([query_embedding], sen_embeddings[1:])[0]

    # argpartition yields the three best positions in arbitrary order;
    # order them by descending score afterwards.
    top_three = numpy.argpartition(scores, -3)[-3:]
    ranked = sorted(top_three, key=lambda position: -scores[position])

    parts = []
    for position in ranked:
        print(scores[position])
        parts.append(kural_definition(position + 1))
    return ''.join(parts)
|
|
|
|
|
def kural_definition(index: int):
    """Format one kural together with its English meaning.

    Args:
        index: Zero-based position into ``kurals`` / ``en_translations``.

    Returns:
        The kural's elements joined by newlines, a newline, the English
        meaning, and a trailing blank line.

    Side effect: prints the meaning and then the joined kural to stdout.
    """
    meaning = en_translations[index]
    print(meaning)

    verse = "\n".join(kurals[index])
    print(verse)

    return verse + "\n" + meaning + "\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|