"""Interactive Thirukural lookup: fetch a kural by number or by meaning.

Kurals and their English meanings are read from ``thirukural_git.json``.
A query containing a positive number returns that kural directly; any other
query is embedded with a sentence-transformers model and the closest English
meanings (by cosine similarity) are returned.
"""
import json
import os
import re

import numpy

DATA_PATH = 'thirukural_git.json'
MODEL_NAME = 'all-MiniLM-L6-v2'
TOP_K = 3  # number of closest kurals returned by a meaning search

# Parallel lists filled by _load_data(): en_translations[i] is the lower-cased
# English meaning of kurals[i].
en_translations = []
kurals = []

# Filled by _ensure_model(): the sentence encoder and one embedding per entry
# of en_translations.
model = None
sen_embeddings = None


def _load_data(path: str = DATA_PATH) -> None:
    """Read the kural JSON file into ``en_translations`` and ``kurals``.

    The lists are updated in place so that references held elsewhere stay
    valid.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if an entry lacks the ``kural`` or ``meaning.en`` keys.
    """
    # Explicit UTF-8: JSON is UTF-8 by specification, and the platform default
    # encoding is not guaranteed to decode non-ASCII content.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    entries = data['kurals']
    en_translations[:] = [entry['meaning']['en'].lower() for entry in entries]
    kurals[:] = [entry['kural'] for entry in entries]


def _ensure_data() -> None:
    """Load the kural data on first use."""
    if not en_translations:
        _load_data()


def _ensure_model() -> None:
    """Load the encoder and embed every English meaning on first use."""
    global model, sen_embeddings
    _ensure_data()
    if model is not None:
        return
    # Imported lazily: the dependency is heavy and only needed for meaning
    # searches, not for lookups by number.
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(MODEL_NAME)
    sen_embeddings = model.encode(en_translations)
    # Disabled in the original script (kept for reference): caching the
    # embeddings on disk instead of re-encoding them at every start.
    # sen_embeddings= numpy.memmap('trainedmodel',mode="r",dtype=numpy.float32,shape=(1330,768))
    # sen_embeddings.tofile('trainedmodel')


def preprocess(input: str):
    """Handle queries that do not need a similarity search.

    Args:
        input: raw query text.

    Returns:
        The formatted kural when the first integer in ``input`` is a valid
        1-based kural number; ``False`` otherwise (commands starting with
        ``/``, no number, or a number outside ``1..len(kurals)``).
    """
    if input.startswith('/'):
        # TODO
        return False
    # Only the integer part of the first number is used, so "3.5" selects
    # kural 3 instead of failing on int("3.5").
    match = re.search(r'-?\d+', input)
    if match is None:
        return False
    _ensure_data()
    index = int(match.group())
    if 0 < index <= len(en_translations):
        return kural_definition(index - 1)
    return False


def rank_top_matches(scores, k: int = TOP_K):
    """Return the indices of the ``k`` highest ``scores``, best first.

    Args:
        scores: one-dimensional sequence of similarity scores.
        k: maximum number of indices to return; clamped to ``len(scores)``.

    Returns:
        A list of ``int`` indices into ``scores``; empty when there is
        nothing to rank.
    """
    count = min(k, len(scores))
    if count <= 0:
        return []
    # argpartition selects the top entries without sorting the whole array;
    # only those few are then ordered.
    top = numpy.argpartition(scores, -count)[-count:]
    return sorted((int(i) for i in top), key=lambda i: -scores[i])


def find_similarities(input: str) -> str:
    """Answer a query with one kural by number or the closest by meaning.

    Prints each similarity score and each returned kural as a side effect.

    Args:
        input: raw query text.

    Returns:
        The formatted kural(s) concatenated into one string.
    """
    response = preprocess(input)
    if response:
        return response
    _ensure_model()
    if not en_translations:
        return ''
    from sklearn.metrics.pairwise import cosine_similarity
    query_embedding = model.encode([input.lower()])[0]
    # Compare against every kural. The earlier version skipped the first
    # embedding, so the first kural could never be matched.
    scores = cosine_similarity([query_embedding], sen_embeddings)[0]
    parts = []
    for index in rank_top_matches(scores):
        print(scores[index])
        parts.append(kural_definition(index))
    return ''.join(parts)


def kural_definition(index: int) -> str:
    """Print and return the kural at ``index`` with its English meaning.

    Args:
        index: 0-based position in ``kurals`` / ``en_translations``.

    Returns:
        The kural lines joined by newlines, then the meaning, then a blank
        line.

    Raises:
        IndexError: if ``index`` is out of range.
    """
    _ensure_data()
    translation = en_translations[index]
    # presumably each kural is a list of lines -- verify against the data file
    kural_text = "\n".join(kurals[index])
    print(translation)
    print(kural_text)
    return kural_text + "\n" + translation + "\n\n"


def main() -> None:
    """Run the interactive prompt until the user types ``exit`` or sends EOF."""
    # Load everything up front so the first question is not delayed.
    _ensure_model()
    while True:
        try:
            text = input('Ask valluvar: ')
        except EOFError:
            break
        if text == 'exit':
            break
        find_similarities(text)


if __name__ == "__main__":
    main()