import os

import gradio as gr
import openai
import pandas as pd
import requests
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity

# Credentials are read from environment variables / Space secrets.
openai.api_key = os.environ.get("openai.api_key")
hf_token = os.environ.get("hf_token")

# Sentence-transformers model used to embed the incoming question via the
# Hugging Face Inference API.
model_id = "sentence-transformers/all-MiniLM-L6-v2"

def generate_embeddings(texts, model_id, hf_token):
    # Embed `texts` with the hosted feature-extraction pipeline for `model_id`.
    # `wait_for_model` makes the API wait instead of erroring while the model loads.
    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    response = requests.post(api_url, headers=headers, json={"inputs": texts, "options": {"wait_for_model": True}})
    embeddings = response.json()
    return embeddings
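
# Note (assumption): for a single input string, the feature-extraction pipeline
# for sentence-transformers/all-MiniLM-L6-v2 is expected to return one flat
# 384-dimensional list of floats, e.g.:
#
#     vec = generate_embeddings("What is DNA?", model_id, hf_token)
#     # len(vec) == 384, each element a float
#
# The cosine-similarity call in reply() below relies on this shape.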
# Load the pre-computed textbook embeddings from the Hugging Face Hub.
AP_Bio = load_dataset("vjain/biology_AP_embeddings")
df1 = pd.DataFrame(AP_Bio["train"])

AP_Physics = load_dataset("vjain/AP_physics_embeddings")
df2 = pd.DataFrame(AP_Physics["train"])

# Map the dropdown choice to the corresponding dataframe.
dataframes = {
    "AP_Bio": df1,
    "AP_Physics": df2
}
def reply(input, dataset_name):
    try:
        if dataset_name not in dataframes:
            return "Invalid dataset selected. Please select a valid dataset."
        if not input:
            return "Please enter a question to get an answer."
        df = dataframes[dataset_name]

        # Embed the question and rank passages by cosine similarity.
        input_vector = generate_embeddings(input, model_id, hf_token)
        df["similarities"] = df["embedding"].apply(
            lambda x: cosine_similarity([x], [input_vector])[0][0]
        )
        data = df.sort_values("similarities", ascending=False).head(10)
        data.to_csv("sorted.csv")

        # Concatenate the top passages into a single context block for the prompt.
        context = "\n".join(row["text"] for _, row in data.iterrows())
        prompt = f"""
                Answer the following question using the context given below. If you don't know the answer for certain, say I don't know.
                Context: {context}
                Q: {input}
                """
        # Completion call uses the pre-1.0 openai Python client interface.
        response = openai.Completion.create(
            prompt=prompt,
            temperature=1,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            model="text-davinci-003"
        )["choices"][0]["text"].strip(" \n")
        return response
    except Exception as e:
        return f"An error occurred: {e}"

csv_dropdown = gr.inputs.Dropdown(
    label="Select the Book",
    choices=["AP_Bio", "AP_Physics"],
    default="AP_Bio"
)
input_text = gr.inputs.Textbox(
    label="Enter your questions here",
    placeholder="E.g. What is DNA?",
    lines=3
)
text_output = gr.outputs.Textbox(label="Answer")

description = "Scholar Bot is a question answering system that provides accurate and relevant answers to questions about the selected OpenStax textbook, such as Biology for AP Courses (https://openstax.org/details/books/biology-ap-courses). Simply enter your question in the text box above and Scholar Bot will search the textbook for the most relevant passages and answer from them. Scholar Bot uses a Sentence Transformers model to generate text embeddings and OpenAI's GPT-3 language model to generate the answer."

ui = gr.Interface(fn=reply,
                  inputs=[input_text, csv_dropdown],
                  outputs=[text_output],
                  title="Scholar Bot",
                  description=description,
                  theme="light",
                  layout="vertical",
                  allow_flagging=False,
                  examples=[["What is the function of DNA polymerase?", "AP_Bio"]]
                )

ui.launch()
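
# Usage note (assumption): running this file directly (e.g. `python app.py`)
# starts a local Gradio server; on Hugging Face Spaces the same script runs at
# startup and serves the interface, with openai.api_key and hf_token supplied
# as Space secrets.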