import os
import re
import requests
import json
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from database import execute_query
from aksharamukha import transliterate
from sentence_transformers import util
from llama_index.embeddings.nomic import NomicEmbedding

nomic_api_key = os.getenv('NOMIC_API_KEY')

# Nomic embedding model used for similarity scores
nomic_embed_model = NomicEmbedding(
    api_key=nomic_api_key,
    dimensionality=128,
    model_name="nomic-embed-text-v1.5",
)
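# Usage sketch (illustrative only; the sample text is an assumption, not project data):
# `get_text_embedding` returns a plain list of floats whose length should match the
# `dimensionality` set above (128 here).
#   vec = nomic_embed_model.get_text_embedding("fire, the sacrificial flame")
#   len(vec)  # -> 128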
def get_list_meaning_word(word):
    """Scrape dictionary meanings for a word from ambuda.org (MW, Shabda-Sagara, Apte)."""
    pada_meanings = {'pada': word,
                     'Monier-Williams Sanskrit-English Dictionary (1899)': [],
                     'Shabda-Sagara (1900)': [],
                     'Apte-Practical Sanskrit-English Dictionary (1890)': [],
                     }
    url = f"https://ambuda.org/tools/dictionaries/mw,shabdasagara,apte/{word}"
    try:
        # Fetch HTML content
        response = requests.get(url)
        response.raise_for_status()
        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        # Each dictionary's entries sit in a separate <div class="my-4" x-show="show">
        divs = soup.find_all('div', class_='my-4', attrs={'x-show': 'show'})
        try:
            # Find all list items <li> within the specified <ul> tag
            div_items_0 = divs[0].find('ul').find_all('li', class_='dict-entry mw-entry')
            # Collect the text content of each list item
            dive_text_0 = [li_tag.get_text(strip=True) for li_tag in div_items_0]
            text_0_trans = [transliterate.process(src='Devanagari', tgt='IAST', txt=text) for text in dive_text_0]
            pada_meanings['Monier-Williams Sanskrit-English Dictionary (1899)'] = text_0_trans
        except (IndexError, AttributeError):
            print("Error: Unable to find Monier-Williams Sanskrit-English Dictionary (1899) data.")
        try:
            div_items_1 = divs[1].find_all('div')
            dive_text_1 = [item.get_text(strip=True) for item in div_items_1]
            text_1_trans = [transliterate.process(src='Devanagari', tgt='IAST', txt=text) for text in dive_text_1]
            pada_meanings['Shabda-Sagara (1900)'] = text_1_trans
        except (IndexError, AttributeError):
            print("Error: Unable to find Shabda-Sagara (1900) data.")
        try:
            apte_meanings = []
            for tag in divs[2].find_all('b'):
                if tag.text.strip() != 'β':
                    text1 = tag.text.strip()                # English text within the <b> tag
                    sibling = tag.find_next_sibling()       # First sibling tag after the <b> tag
                    text2 = tag.next_sibling.strip() + ' '  # Plain text immediately following the <b> tag
                    while sibling.name != 'div':
                        if sibling.name is None:            # Handling non-tag text
                            text2 += " "
                        elif sibling.name == 'span':        # Handling <span> tags: Devanagari that needs transliteration
                            IAST_text = transliterate.process(src='Devanagari', tgt='IAST', txt=sibling.text.strip())
                            text2 += IAST_text + ' ' + sibling.next_sibling.strip()
                        else:
                            text2 += sibling.text.strip() + ' ' + sibling.next_sibling.strip()
                        sibling = sibling.find_next_sibling()
                    apte_meanings.append(text2)
            pada_meanings['Apte-Practical Sanskrit-English Dictionary (1890)'] = apte_meanings[:-1]
        except (IndexError, AttributeError, TypeError):
            print("Error: Unable to find Apte-Practical Sanskrit-English Dictionary (1890) data.")
    except requests.exceptions.RequestException as e:
        print(f"Error: Failed to fetch data from {url}. {e}")
    return pada_meanings
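# Usage sketch (hedged; "agni" is just an illustrative headword, and the call needs network access):
#   meanings = get_list_meaning_word("agni")
#   meanings['pada']                                                # -> "agni"
#   meanings['Monier-Williams Sanskrit-English Dictionary (1899)']  # -> list of IAST glosses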
# Get similarity scores between a given meaning and the dictionary meanings of a root/stem word
def word_sentence_similarity(meanings, root_stem_word):
    # Return early if either input is empty
    if not meanings or not root_stem_word:
        return None
    meaning_embedding = np.array(nomic_embed_model.get_text_embedding(meanings))
    # Fetch the dictionary entries once and pool the MW and Shabda-Sagara meanings
    dictionary_meanings = get_list_meaning_word(root_stem_word)
    all_meanings = []
    word_score_pair = []
    all_meanings.extend(dictionary_meanings['Monier-Williams Sanskrit-English Dictionary (1899)'])
    all_meanings.extend(dictionary_meanings['Shabda-Sagara (1900)'])
    for word_meaning in all_meanings:
        root_stem_word_meaning_embedding = np.array(nomic_embed_model.get_text_embedding(word_meaning))
        # Calculate cosine similarity
        similarity_score = util.pytorch_cos_sim(meaning_embedding, root_stem_word_meaning_embedding).item()
        word_score_pair.append((word_meaning, similarity_score))
    # Sort the list in descending order based on similarity scores
    sorted_word_score_pairs = sorted(word_score_pair, key=lambda x: x[1], reverse=True)
    return sorted_word_score_pairs
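# Usage sketch (hedged; the gloss and headword are illustrative, and each call hits both
# the Ambuda scraper and the Nomic embedding API):
#   pairs = word_sentence_similarity("the sacrificial fire", "agni")
#   best_meaning, best_score = pairs[0]   # highest cosine similarity first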
# Extract the adhibhautic meaning of the mantra from the vedamantra data for the target language
def extract_meaning_by_language(data_list, target_language='English'):
    for data_dict in data_list:
        if data_dict.get('languageName') == target_language:
            return data_dict.get('mahatma', {})
    return None
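# Usage sketch (hedged; the key names mirror what the function reads, the values are made up):
#   translations = [{'languageName': 'Hindi', 'mahatma': {'meaning': '...'}},
#                   {'languageName': 'English', 'mahatma': {'meaning': 'sample gloss'}}]
#   extract_meaning_by_language(translations)   # -> {'meaning': 'sample gloss'}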
# Fetch and parse the mantra_json details returned by a database query
def get_details_mantra_json(query):
    description, data = execute_query(query)
    print(data)
    df = pd.DataFrame(data)
    df.columns = [x[0] for x in description]
    mantra_json = df['mantra_json'].values[0]
    # Strip any HTML tags embedded in the stored JSON before parsing
    cleaned_data = re.sub('<[^<]+?>', '', mantra_json)
    return json.loads(cleaned_data)
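# Usage sketch (hedged; the table and column names in the query are assumptions, only a
# `mantra_json` column is actually required by the code above):
#   query = "SELECT * FROM veda_content WHERE mantra_number = 'some-id'"
#   details = get_details_mantra_json(query)    # -> dict parsed from the stored JSON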
def iast_process(input_text):
    # Remove Vedic accent marks (U+0951..U+0954), zero-width joiners, and combining macron below.
    # Note: the marks are listed as a plain character class; the commas in the original
    # pattern would also have stripped literal commas from the text.
    output_text = re.sub('[\u0951-\u0954\u200d\u0331]', '', input_text)
    return output_text
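# Minimal offline demo (assumption: run as a script; the sample strings and dict are
# illustrative, not project data). Only the pure helpers are exercised, so no network
# access or API key is needed.
if __name__ == "__main__":
    print(iast_process("agni\u0951m"))   # accent mark stripped -> "agnim"
    sample = [{'languageName': 'English', 'mahatma': {'meaning': 'sample gloss'}}]
    print(extract_meaning_by_language(sample))   # -> {'meaning': 'sample gloss'}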