import gradio as gr
from sentence_transformers import SentenceTransformer
import pandas as pd
import pickle
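
# Gradio app: semantic search over Quran verses. The user's query is embedded with
# intfloat/multilingual-e5-large-instruct and scored against pre-computed verse
# embeddings; the best-matching verses are returned as a ranked dataframe.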

def find(query):
    def get_detailed_instruct(task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery: {query}'
    
    # Each query must come with a one-sentence instruction that describes the task
    task = 'Given a web search query, retrieve relevant passages that answer the query'
    queries = [
        get_detailed_instruct(task, query)
    ]
    print("cekpoin0\n")
    
    quran = pd.read_csv('quran-simple-clean.txt', delimiter="|")
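    # quran-simple-clean.txt is expected to be pipe-delimited with sura|aya|text
    # columns, so `quran` holds the full verse text keyed by sura and aya numbers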
    
    # Load the pre-split verse dataframe (a pickled pandas DataFrame)
    with open('quran-splitted.sav', 'rb') as file:
        quran_splitted = pickle.load(file)
    
    model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')
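    # This should be the same model that produced the pickled verse embeddings
    # loaded below; embeddings from different models are not comparable.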
    
    documents = quran_splitted['text'].tolist()
    # The verse embeddings were pre-computed once and pickled; the commented-out
    # lines below show how to regenerate them with the same model.
    # document_embeddings = model.encode(documents, convert_to_tensor=True, normalize_embeddings=True)
    # filename = 'encoded_quran_text_split_multilingual-e5-large-instruct.sav'
    # pickle.dump(document_embeddings, open(filename, 'wb'))
    with open('encoded_quran_text_split_multilingual-e5-large-instructs.sav', 'rb') as file:
        document_embeddings = pickle.load(file)
    print("checkpoint 1")
    
    # Embed the query and score it against every verse embedding; with normalized
    # embeddings the scaled dot product is a cosine similarity in [-100, 100]
    query_embeddings = model.encode(queries, convert_to_tensor=True, normalize_embeddings=True)
    scores = (query_embeddings @ document_embeddings.T) * 100
    print("checkpoint 2")

    # Attach the similarity scores to the verse dataframe (already loaded above)
    # and sort it from most to least similar
    quran_splitted['similarity'] = scores.tolist()[0]
    sorted_quran = quran_splitted.sort_values(by='similarity', ascending=False)
    print("checkpoint 3")
    
    results = ""
    i = 0
    while i<6:
        result = sorted_quran.iloc[i]
        result_quran = quran.loc[(quran['sura']==result['sura']) & (quran['aya']==result['aya'])]
        results = results + result_quran['text'].item()+" (Q.S "+str(result['sura']).rstrip('.0')+":"+str(result['aya']).rstrip('.0')+")\n"
        i=i+1

    return sorted_quran
    #return results
    
demo = gr.Interface(fn=find, inputs="textbox", outputs=gr.Dataframe(headers=['sura', 'aya', 'similarity', 'text']))
#demo = gr.Interface(fn=find, inputs="textbox", outputs="textbox")
    
if __name__ == "__main__":
    demo.launch()
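    # Running this file starts a local Gradio server; find() can also be called
    # directly, e.g. find("patience in hardship") returns the ranked dataframe.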