"""Look up Thirukkural couplets by number or by semantic similarity.

On import this module loads every kural from ``thirukural_git.json``,
encodes the (lower-cased) English meanings with a SentenceTransformer model
and keeps the resulting embeddings in memory.  ``find_similarities`` is the
entry point: it answers a free-text query either with the kural whose number
appears in the query or with the closest matches by cosine similarity.
"""
import json
import logging
import os
import re
from typing import Union

import numpy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

logger = logging.getLogger(__name__)

# How many of the best semantic matches are returned for a free-text query.
TOP_MATCHES = 3

# Whole (optionally signed) integers only.  The sign is kept so that "-5" is
# seen as a non-positive number instead of being read as kural 5.
_NUMBER_PATTERN = re.compile(r'-?\d+')

en_translations = []
kurals = []
ta_translations = []

# The file contains Tamil text, so the encoding is pinned to UTF-8 instead of
# depending on the platform default; ``with`` closes the file even when
# parsing fails.
with open('thirukural_git.json', encoding='utf-8') as f:
    data = json.load(f)

# The three lists are parallel: position i in each describes kural number i+1.
for kural in data['kurals']:
    en_translations.append(kural['meaning']['en'].lower())
    ta_translations.append(kural['meaning']['ta_salamon'].lower())
    kurals.append(kural['kural'])

model = SentenceTransformer('all-MiniLM-L6-v2')
# One embedding row per entry of ``en_translations``, in the same order.
sen_embeddings = model.encode(en_translations)

# Disabled experiment: caching the embeddings on disk instead of re-encoding
# on every import.  NOTE(review): the hard-coded shape may not match the
# model used above - confirm before re-enabling.
# sen_embeddings= numpy.memmap('trainedmodel',mode="r",dtype=numpy.float32,shape=(1330,768))
# sen_embeddings.tofile('trainedmodel')


def preprocess(input: str) -> Union[str, bool]:
    """Answer queries that refer to a kural by its number.

    Args:
        input: The raw user query.

    Returns:
        The formatted kural (see ``kural_definition``) when the first integer
        found in ``input`` is a valid 1-based kural number.  ``False`` when
        the query starts with ``/`` (commands are not implemented yet), when
        it contains no integer, or when the first integer is outside
        ``1..len(kurals)``.
    """
    if input.startswith('/'):
        # TODO: handle slash commands.
        return False

    match = _NUMBER_PATTERN.search(input)
    if match is None:
        return False

    # Only the first number in the query is considered.
    number = int(match.group())
    if 1 <= number <= len(kurals):
        return kural_definition(number - 1)
    return False


def find_similarities(input: str) -> str:
    """Return the kurals that best answer a query.

    A query that names a valid kural number is answered with exactly that
    kural.  Otherwise the query is embedded with ``model`` and compared with
    every row of ``sen_embeddings`` by cosine similarity; the ``TOP_MATCHES``
    closest kurals are returned, best match first.

    Args:
        input: The raw user query.

    Returns:
        The concatenated ``kural_definition`` text of the selected kurals, or
        ``"Try again with different query"`` if anything goes wrong.
    """
    try:
        response = preprocess(input)
        if response:
            return response

        input_embeddings = model.encode([input.lower()])
        # Compare against *all* embeddings so that row i is kural index i and
        # the first kural can be found as well.
        scores = cosine_similarity([input_embeddings[0]], sen_embeddings)[0]

        # Indices ordered by descending similarity, truncated to the top few;
        # slicing copes with fewer than TOP_MATCHES kurals.
        best = numpy.argsort(scores)[::-1][:TOP_MATCHES]

        parts = []
        for position in best:
            print(scores[position])
            parts.append(kural_definition(int(position)))
        return ''.join(parts)
    except Exception:
        # Best-effort boundary: the caller always gets a reply, but the
        # failure is recorded instead of being swallowed silently.
        logger.exception("Failed to answer query %r", input)
        return "Try again with different query"


def kural_definition(index: int) -> str:
    """Format one kural with its Tamil and English meanings.

    The English meaning and the kural text are also printed to stdout.

    Args:
        index: 0-based position in ``kurals`` / ``ta_translations`` /
            ``en_translations``.

    Returns:
        ``"<number>. <kural>"`` followed by the Tamil meaning, the English
        meaning and a blank line, where ``<number>`` is ``index + 1``.

    Raises:
        IndexError: If ``index`` is out of range for the loaded data.
    """
    # presumably each entry of ``kurals`` is a list of lines - verify against
    # the JSON schema; the entries are joined with newlines.
    kural_text = "\n".join(kurals[index])
    print(en_translations[index])
    response = (
        f"{index + 1}. {kural_text}\n"
        f"{ta_translations[index]}\n"
        f"{en_translations[index]}\n\n"
    )
    print(kural_text)
    return response


# Disabled interactive loop, kept for manual testing:
# while True:
#     text = input('Ask valluvar: ')
#     if (text == 'exit'):
#         break
#     find_similarities(text)