# File size: 8,261 Bytes
# 88da1f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# (Line-number gutter 1-193 left over from the web file viewer; not part of the program.)
import gradio as gr
import os
import time
import pandas as pd


from langchain.document_loaders import OnlinePDFLoader #for laoding the pdf
from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
from langchain.vectorstores import Chroma # for the vectorization part
from langchain.chains import RetrievalQA # for conversing with chatGPT
from langchain.chat_models import ChatOpenAI # the LLM model we'll use (ChatGPT)
from langchain import PromptTemplate

def _select_pages(pages, relevant_pages):
    """Return the pages named in *relevant_pages*, or all pages if none match.

    *relevant_pages* is a comma separated string of 1-based page numbers.
    Surrounding whitespace is ignored; entries that are not plain decimal
    numbers or fall outside the document are skipped. Order and duplicates
    of the valid entries are kept as typed.
    """
    selected = []
    for token in (relevant_pages or "").split(","):
        token = token.strip()
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        if not token.isdecimal():
            continue
        page_index = int(token) - 1
        if 0 <= page_index < len(pages):
            selected.append(pages[page_index])
    # In the scenario where none of the page numbers supplied exist in the PDF,
    # we revert to using the entire PDF.
    return selected or list(pages)


def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
    """Index a PDF into a vector store and build the global QA chain.

    Args:
        pdf_doc: Uploaded file object; its ``name`` attribute is passed to
            ``OnlinePDFLoader``. May be ``None`` when nothing was uploaded.
        open_ai_key: OpenAI API key typed by the user.
        relevant_pages: Optional comma separated 1-based page numbers. When
            blank, or when no entry matches a page, the whole PDF is used.

    Returns:
        ``"Ready"`` once the chain is built, otherwise a short message
        telling the user which input is missing.

    Side effects:
        Sets the ``OPENAI_API_KEY`` environment variable and rebinds the
        module-level ``pdf_qa`` chain used by the answer functions.
    """
    global pdf_qa

    # Validate the *parameter*; a blank textbox arrives as "" rather than None.
    api_key = (open_ai_key or "").strip()
    if not api_key:
        return "Please provide an OpenAI gpt-4 API key"
    if pdf_doc is None:
        return "Please upload a PDF file"

    os.environ['OPENAI_API_KEY'] = api_key

    # Load the pdf file and split it into page documents
    pages = OnlinePDFLoader(pdf_doc.name).load_and_split()

    # OpenAIEmbeddings is responsible for generating embeddings for text
    embeddings = OpenAIEmbeddings()

    # The Chroma vector store takes the documents (pages in our case) and the embeddings instance
    vectordb = Chroma.from_documents(_select_pages(pages, relevant_pages), embedding=embeddings)

    # The prompt text (including its indentation) is sent to the model verbatim.
    prompt_template = """Use the following pieces of context to answer the question at the end. If you do not know the answer, just return N/A. If you encounter a date, return it in mm/dd/yyyy format.

        {context}

        Question: {question}
        Return just the answer :"""
    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain_type_kwargs = {"prompt": PROMPT}

    # Finally, we create the bot using the RetrievalQA class
    pdf_qa = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(temperature=0, model_name="gpt-4"),
        chain_type="stuff",
        retriever=vectordb.as_retriever(search_kwargs={"k": 5}),
        chain_type_kwargs=chain_type_kwargs,
        return_source_documents=False,
    )

    return "Ready"
        

def answer_predefined_questions(document_type):
    """Ask the loaded PDF a fixed question set for the given document type.

    Args:
        document_type: ``"Deed of Trust"`` or ``"Transmittal Summary"``.

    Returns:
        A ``pandas.DataFrame`` with one row per question and the columns
        ``"Field"``, ``"Question to gpt-4"`` and ``"Response from gpt-4"``.
        For any other *document_type* the string
        ``"Please choose your Document Type"`` is returned instead.
        NOTE(review): this string is wired to a ``gr.Dataframe`` output in
        this file - confirm Gradio displays it as intended.

    Raises:
        NameError: if the module-level ``pdf_qa`` chain has not been created
            yet (it is only assigned by ``load_pdf_and_generate_embeddings``).
    """
    # (field label, question) pairs for each supported document type.
    question_sets = {
        "Deed of Trust": [
            ("Loan Number", "what is the Loan Number?"),
            ("Borrower", "Who is the Borrower?"),
            ("Case Number", "what is the Case Number?"),
            ("MIN Number", "what is the Mortgage Identification number?"),
            ("Signed Date", "DOT signed date?"),
            ("Lender", "Who is the Lender?"),
            ("VA/FHA Number", "what is the VA/FHA Number?"),
            ("Co-Borrower", "Who is the Co-Borrower?"),
            ("Property Type", "What is the property type - single family, multi family?"),
            ("Property Address", "what is the Property Address?"),
            ("Property County", "In what County is the property located?"),
            ("Electronic Recording Date", "what is the Electronically recorded date"),
        ],
        "Transmittal Summary": [
            ("Borrower", "Who is the Borrower?"),
            ("Property Address", "what is the Property Address?"),
            ("Loan Term", "what is the Loan Term?"),
            ("Base Income", "What is the Base Income?"),
            ("Borrower's SSN", "what is the Borrower's SSN?"),
            ("Co-Borrower", "Who is the Co-Borrower?"),
            ("Original Loan Amount", "What is the Original Loan Amount?"),
            ("Initial P&I payment", "What is the Initial P&I payment?"),
            ("Co-Borrower’s SSN", "What is the Co-Borrower's SSN?"),
            ("Units#", "Number of units?"),
            ("Seller", "Who is the Seller?"),
            ("Signed Date", "Document signed date?"),
        ],
    }

    field_queries = question_sets.get(document_type)
    if field_queries is None:
        return "Please choose your Document Type"

    fields = [field for field, _ in field_queries]
    queries = [query for _, query in field_queries]
    # One chain call per question, in table order.
    responses = [pdf_qa.run(query) for query in queries]

    return pd.DataFrame({
        "Field": fields,
        "Question to gpt-4": queries,
        "Response from gpt-4": responses,
    })
        

    
def answer_query(query):
    """Send the user's free-form *query* to the global ``pdf_qa`` chain and return its answer."""
    return pdf_qa.run(query)
    

# Custom CSS handed to gr.Blocks: limits the element with id "col-container"
# to 700px wide and centres it horizontally.
css="""
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# HTML banner rendered through gr.HTML at the top of the page: the app title
# followed by short usage instructions.
title = """
<div style="text-align: center;max-width: 700px;">
    <h1>Chatbot for PDFs - GPT-4</h1>
    <p style="text-align: center;">Upload a .PDF, click the "Upload PDF and generate embeddings" button, <br />
    Wait for the Status to show Ready. You can chose to get answers to the pre-defined question set OR ask your own question <br />
    The app is built on GPT-4 and leverages PromptTemplate</p>
</div>
"""

# Build the Gradio page: a title banner, the upload/key inputs, and three
# button-driven actions (index the PDF, run the predefined question set,
# answer a free-form question).
# NOTE(review): `.style(full_width=False)` and `gr.File(type='file')` look tied
# to an older Gradio release - confirm against the pinned Gradio version.
with gr.Blocks(css=css,theme=gr.themes.Monochrome()) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)

    with gr.Column():
        openai_key = gr.Textbox(label="Your GPT-4 OpenAI API key", type="password")
        pdf_doc = gr.File(label="Load a pdf",file_types=['.pdf'],type='file')
        relevant_pages = gr.Textbox(label="*Optional - List comma separated page numbers to load or leave this field blank to use the entire PDF")

        # Indexing status next to the button that triggers it
        with gr.Row():
            status = gr.Textbox(label="Status", placeholder="", interactive=False)
            load_pdf = gr.Button("Upload PDF and generate embeddings").style(full_width=False)

        # Predefined question set: choose a document type, get a table of answers
        with gr.Row():
            document_type = gr.Radio(['Deed of Trust', 'Transmittal Summary'], label="Select the Document Type")
            answers = gr.Dataframe(label="Answers to Predefined Question set")
            answers_for_predefined_question_set = gr.Button("Get gpt-4 answers to pre-defined question set").style(full_width=False)

        # Free-form question and its answer
        with gr.Row():
            # Named query_box rather than `input` so the builtin is not shadowed.
            query_box = gr.Textbox(label="Type in your question")
            output = gr.Textbox(label="Answer")
            submit_query = gr.Button("Submit your own question to gpt-4").style(full_width=False)

    # Wire each button to its handler; keyword arguments keep the three calls uniform.
    load_pdf.click(
        fn=load_pdf_and_generate_embeddings,
        inputs=[pdf_doc, openai_key, relevant_pages],
        outputs=status,
    )
    answers_for_predefined_question_set.click(
        fn=answer_predefined_questions,
        inputs=document_type,
        outputs=answers,
    )
    submit_query.click(fn=answer_query, inputs=query_box, outputs=output)


demo.launch()