File size: 5,520 Bytes
377ed3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84bf07e
377ed3a
 
 
 
84bf07e
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
import re
import requests
import json
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from database import execute_query
from aksharamukha import transliterate
from sentence_transformers import util
from llama_index.embeddings.nomic import NomicEmbedding

# Shared Nomic embedding client; the similarity scoring below embeds text with it.
# The API key is read from the NOMIC_API_KEY environment variable (None if unset).
nomic_api_key = os.environ.get('NOMIC_API_KEY')
nomic_embed_model = NomicEmbedding(
    model_name="nomic-embed-text-v1.5",
    dimensionality=128,
    api_key=nomic_api_key,
)



def _to_iast(texts):
    """Transliterate every Devanagari string in *texts* to IAST."""
    return [transliterate.process(src='Devanagari', tgt='IAST', txt=text) for text in texts]


def _trailing_text(node):
    """Return the stripped text node directly following *node*, or '' if there is none."""
    following = node.next_sibling
    # bs4 text nodes are str subclasses; a Tag or None contributes no text here
    # (the original called .strip() on them and aborted the whole Apte section).
    return following.strip() if isinstance(following, str) else ''


def _parse_apte_entries(container):
    """Collect one meaning string per non-dash <b> tag found inside *container*."""
    entries = []
    for bold in container.find_all('b'):
        if bold.text.strip() == '—':
            continue
        meaning = _trailing_text(bold) + ' '
        sibling = bold.find_next_sibling()
        # Walk the following sibling tags until the next <div>. Stop cleanly when
        # the siblings run out instead of raising on None and losing every entry.
        while sibling is not None and sibling.name != 'div':
            piece = sibling.text.strip()
            if sibling.name == 'span':
                # <span> content is transliterated from Devanagari to IAST.
                piece = transliterate.process(src='Devanagari', tgt='IAST', txt=piece)
            meaning += piece + ' ' + _trailing_text(sibling)
            sibling = sibling.find_next_sibling()
        entries.append(meaning)
    return entries


def get_list_meaning_word(word):
    """Look up *word* on ambuda.org and return its dictionary meanings.

    The page for the Monier-Williams, Shabda-Sagara and Apte dictionaries is
    fetched and each dictionary section is parsed independently, so a failure
    in one section does not discard the others. Failures are reported with
    ``print`` and leave the corresponding list empty.

    Args:
        word: The word to look up; it is interpolated into the request URL.

    Returns:
        A dict with the key ``'pada'`` (the input word) and one key per
        dictionary title, each mapping to a list of meaning strings
        transliterated to IAST where the parser applies transliteration.
    """
    pada_meanings = {'pada': word,
                     'Monier-Williams Sanskrit-English Dictionary (1899)': [],
                     'Shabda-Sagara (1900)': [],
                     'Apte-Practical Sanskrit-English Dictionary (1890)': [],
                     }
    url = f"https://ambuda.org/tools/dictionaries/mw,shabdasagara,apte/{word}"

    try:
        # A timeout keeps a stalled server from blocking the caller forever;
        # requests' Timeout is a RequestException and is handled below.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: Failed to fetch data from {url}. {e}")
        return pada_meanings

    soup = BeautifulSoup(response.text, 'html.parser')

    # One matching <div> per dictionary, in the order requested in the URL.
    divs = soup.find_all('div', class_='my-4', attrs={'x-show': 'show'})

    # Each section is best-effort: `except Exception` keeps that behaviour
    # without also swallowing KeyboardInterrupt/SystemExit like a bare except.
    try:
        mw_items = divs[0].find('ul').find_all('li', class_='dict-entry mw-entry')
        mw_texts = [li_tag.get_text(strip=True) for li_tag in mw_items]
        pada_meanings['Monier-Williams Sanskrit-English Dictionary (1899)'] = _to_iast(mw_texts)
    except Exception:
        print("Error: Unable to find Monier-Williams Sanskrit-English Dictionary (1899) data.")

    try:
        sagara_items = divs[1].find_all('div')
        sagara_texts = [item.get_text(strip=True) for item in sagara_items]
        pada_meanings['Shabda-Sagara (1900)'] = _to_iast(sagara_texts)
    except Exception:
        print("Error: Unable to find Shabda-Sagara (1900) data.")

    try:
        apte_meanings = _parse_apte_entries(divs[2])
        # The final collected entry is dropped, exactly as before.
        # NOTE(review): presumably it is not a real definition — confirm against the page.
        pada_meanings['Apte-Practical Sanskrit-English Dictionary (1890)'] = apte_meanings[:-1]
    except Exception:
        print("Error: Unable to find Apte-Practical Sanskrit-English Dictionary (1890) data.")

    return pada_meanings

#get similarity scores
def word_sentence_similarity(meanings, root_stem_word):
    """Rank the dictionary meanings of a word by similarity to a given text.

    The text in *meanings* is embedded with the module-level Nomic model and
    compared, by cosine similarity, with the embedding of every
    Monier-Williams and Shabda-Sagara meaning returned by
    ``get_list_meaning_word`` for *root_stem_word*.

    Args:
        meanings: Text to compare against; passed to ``get_text_embedding``.
        root_stem_word: Word whose dictionary meanings are looked up.

    Returns:
        A list of ``(meaning, score)`` tuples sorted by score, highest first,
        or ``None`` when either argument is empty/falsy.
    """
    if not meanings or not root_stem_word:
        return None

    meaning_embedding = np.array(nomic_embed_model.get_text_embedding(meanings))

    # Look the word up once; the previous code repeated the whole fetch and
    # parse for each dictionary it read from the result.
    dictionary_entries = get_list_meaning_word(root_stem_word)
    candidates = [
        *dictionary_entries['Monier-Williams Sanskrit-English Dictionary (1899)'],
        *dictionary_entries['Shabda-Sagara (1900)'],
    ]

    scored_pairs = []
    for candidate in candidates:
        candidate_embedding = np.array(nomic_embed_model.get_text_embedding(candidate))
        score = util.pytorch_cos_sim(meaning_embedding, candidate_embedding).item()
        scored_pairs.append((candidate, score))

    # Stable sort: ties keep dictionary order, matching sorted(..., reverse=True).
    scored_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return scored_pairs

#extract the adhibautic meaning of the mantra from the vedamantra
def extract_meaning_by_language(data_list, target_language='English'):
    """Return the ``'mahatma'`` value of the first entry in the target language.

    Args:
        data_list: Iterable of dicts, each optionally carrying
            ``'languageName'`` and ``'mahatma'`` keys.
        target_language: Value of ``'languageName'`` to look for.

    Returns:
        The matching entry's ``'mahatma'`` value (``{}`` when that key is
        absent), or ``None`` when no entry matches.
    """
    in_target_language = (
        entry for entry in data_list
        if entry.get('languageName') == target_language
    )
    first_match = next(in_target_language, None)
    if first_match is None:
        return None
    return first_match.get('mahatma', {})

#mantra_json_details
def get_details_mantra_json(query):
    """Run *query* and return the first row's ``mantra_json`` as parsed JSON.

    The rows returned by ``execute_query`` are printed, loaded into a
    DataFrame whose column names are taken from the first element of each
    description item, and the first ``mantra_json`` value is stripped of
    angle-bracket tags before being decoded.

    Args:
        query: Query passed unchanged to ``execute_query``.

    Returns:
        The object produced by ``json.loads`` on the tag-stripped value.
    """
    description, rows = execute_query(query)
    print(rows)

    frame = pd.DataFrame(rows)
    frame.columns = [column[0] for column in description]

    raw_json = frame['mantra_json'].values[0]
    # Remove anything that looks like an HTML/XML tag before decoding.
    without_tags = re.sub('<[^<]+?>', '', raw_json)
    return json.loads(without_tags)

def iast_process(input_text):
    """Remove Vedic accent marks and joiner characters from *input_text*.

    The characters removed are U+0951-U+0954 (Devanagari stress/accent
    signs), U+200D (zero width joiner) and U+0331 (combining macron below).
    All other characters, including commas, are kept.

    Args:
        input_text: The string to clean.

    Returns:
        A copy of *input_text* without the characters listed above.
    """
    # Inside a character class a comma is a literal, not a separator: the old
    # pattern '[\u0951-\u0954,\u200d,\u0331]' therefore also deleted every ','.
    return re.sub('[\u0951-\u0954\u200d\u0331]', '', input_text)