Spaces:

thiyagab
/

Thamizh

Build error

Thamizh / semanticsearch.py

thiyagab

Changed to gradio

b10d5a0 almost 2 years ago

1.88 kB

	# Sample sentences to encode. Sentences 0 and 2 are paraphrases of each
	# other (same meaning, different wording), which makes them a handy sanity
	# check for semantic similarity.
	# NOTE(review): `sen` is not referenced anywhere else in the visible part of
	# this file -- confirm whether it is still needed.
	sen = [
	"Three years later, the coffin was still full of Jello.",
	"The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
	"The person box was packed with jelly many dozens of months later.",
	"He found a leprechaun in his walnut shell."
	]

	import json
	import numpy
	import os

	# Load the corpus from the JSON file. The context manager guarantees the
	# file handle is closed even if parsing fails, and the encoding is pinned to
	# UTF-8 (the JSON interchange encoding) instead of the platform default.
	with open('thirukural_git.json', encoding='utf-8') as f:
	    # json.load returns the document as a dictionary.
	    data = json.load(f)

	# Parallel lists: en_translations[i] is the lower-cased English meaning of
	# the entry whose original text is kurals[i].
	en_translations = []
	kurals = []
	for kural in data['kurals']:
	    en_translations.append(kural['meaning']['en'].lower())
	    kurals.append(kural['kural'])

	from sentence_transformers import SentenceTransformer
	model = SentenceTransformer('all-MiniLM-L6-v2')
	# model.tokenizer.add_special_tokens({'pad_token':'[thiyaga]'})

	# Encode every English meaning once at import time; row i of
	# sen_embeddings is the embedding of en_translations[i].
	sen_embeddings = model.encode(en_translations)

	# sen_embeddings= numpy.memmap('trainedmodel',mode="r",dtype=numpy.float32,shape=(1330,768))
	# sen_embeddings.tofile('trainedmodel')

	def find_similarities(input: str) -> str:
	    """Return the corpus entries whose English meaning best matches *input*.

	    The query is lower-cased, embedded with the module-level ``model`` and
	    compared by cosine similarity against every row of ``sen_embeddings``.
	    Up to three best matches are returned, most similar first.

	    Args:
	        input: Free-text query (the name shadows the builtin but is kept
	            because it is part of the public signature).

	    Returns:
	        One string containing, for each match, the original text joined
	        with newlines, then its English meaning, then a blank line.

	    Side effects:
	        Prints each match's score, meaning and text to stdout.
	    """
	    from sklearn.metrics.pairwise import cosine_similarity

	    query_embedding = model.encode([input.lower()])[0]

	    # Compare against the WHOLE corpus. The previous version sliced
	    # sen_embeddings[1:] and compensated with index + 1, so the first
	    # entry could never be returned.
	    scores = cosine_similarity([query_embedding], sen_embeddings)[0]

	    # argsort gives ascending order; reverse it so the best match comes
	    # first, and cap at the corpus size so tiny corpora do not raise.
	    top_k = min(3, len(scores))
	    best_indices = numpy.argsort(scores)[::-1][:top_k]

	    parts = []
	    for index in best_indices:
	        meaning = en_translations[index]
	        # assumes each kurals entry is a sequence of line strings -- TODO confirm
	        kural_text = "\n".join(kurals[index])
	        print(scores[index])
	        print(meaning)
	        print(kural_text)
	        parts.append(kural_text + "\n" + meaning + "\n\n")
	    return "".join(parts)

	# while True:
	# text=input('Ask valluvar: ')
	# if( text == 'exit'):
	# break
	# find_similarities(text)