File size: 5,424 Bytes
c191ddc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e9669e
c191ddc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e9669e
c191ddc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import os

# Load environment variables from a local .env file (if present) into the
# process environment so the API key below can be read with os.getenv.
load_dotenv()

# Fetch the Google API key from the environment. os.getenv returns None when
# GOOGLE_API_KEY is not set; main() checks for that before processing uploads.
api_key = os.getenv("GOOGLE_API_KEY")

# Set the page configuration for the Streamlit app (browser tab title and wide
# layout). NOTE: Streamlit documents that set_page_config must be the first
# Streamlit command on the page, so keep it ahead of the markdown call below.
st.set_page_config(page_title="DocWizard Instant Insights and Analysis", layout="wide")

# Header and instructions rendered at the top of the page on every script run.
st.markdown("""
## Document Intelligence Explorer 🤖

This chatbot utilizes the Retrieval-Augmented Generation (RAG) framework with Google's Generative AI model Gemini-PRO. It processes uploaded PDF documents by segmenting them into chunks, creating a searchable vector store, and generating precise answers to your questions. This method ensures high-quality, contextually relevant responses for an efficient user experience.

### How It Works

1. **Upload Your Documents**: You can upload multiple PDF files simultaneously for comprehensive analysis.
2. **Ask a Question**: After processing the documents, type your question related to the content of your uploaded documents for a detailed answer.
""")

def get_pdf_text(pdf_docs):
    """
    Extract and concatenate the text of every page of the uploaded PDFs.

    Args:
        pdf_docs: Iterable of PDF sources that ``PdfReader`` can open (the
            app passes the files returned by the Streamlit uploader).
            ``None`` or an empty iterable is treated as "no documents".

    Returns:
        str: The page texts joined in document order, then page order, with
        no separator between pages. Pages whose ``extract_text()`` result is
        empty or ``None`` are skipped, so the result is ``""`` when nothing
        could be extracted.
    """
    # Lazily walk every page of every document; each page is extracted once.
    page_texts = (
        page.extract_text()
        for pdf in (pdf_docs or ())
        for page in PdfReader(pdf).pages
    )
    # A single join avoids repeated string reallocation from `+=` in a loop.
    return "".join(page_text for page_text in page_texts if page_text)

def get_text_chunks(text, chunk_size=10000, chunk_overlap=1000):
    """
    Split text into manageable chunks for processing.

    Args:
        text: The full text to split. Empty or ``None`` yields no chunks.
        chunk_size: Maximum chunk size passed to
            ``RecursiveCharacterTextSplitter`` (default 10000).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter (default 1000).

    Returns:
        list[str]: The chunks produced by the splitter, or ``[]`` when there
        is no text to split.
    """
    # Nothing to split: skip building a splitter and report "no chunks".
    if not text:
        return []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

def get_vector_store(text_chunks, api_key):
    """
    Create a FAISS vector store from text chunks and save it to disk.

    The chunks are embedded with the Google "models/embedding-001" model and
    the resulting index is written to the local ``faiss_index`` folder.
    Outcome messages are shown in the Streamlit UI; exceptions are reported
    there rather than propagated.

    Args:
        text_chunks: List of text chunks to embed and index.
        api_key: Google API key used by the embedding client.

    Returns:
        bool: ``True`` if the index was created and saved, ``False`` if there
        was nothing to index or an error occurred.
    """
    # An empty list cannot be indexed; report it clearly instead of letting
    # the vector store constructor fail with an opaque error.
    if not text_chunks:
        st.error("Error creating FAISS index: no text chunks to index.")
        return False

    try:
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
        vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
        vector_store.save_local("faiss_index")
    except Exception as e:
        # UI boundary: surface any embedding/indexing/saving failure to the user.
        st.error(f"Error creating FAISS index: {e}")
        return False

    st.success("FAISS index created and saved successfully.")
    return True

def get_conversational_chain(api_key):
    """
    Build the question-answering chain backed by the Gemini-PRO chat model.

    The chain uses the "stuff" strategy with a prompt that takes two
    variables, ``context`` and ``question``, and instructs the model to
    answer only from the supplied context.

    Args:
        api_key: Google API key handed to the chat model client.

    Returns:
        The chain object produced by ``load_qa_chain``.
    """
    qa_template = """
    Answer the question as detailed as possible from the provided context. If the answer is not in the provided context, 
    say "Answer is not available in the context". Do not provide incorrect information.\n\n
    Context:\n{context}\n
    Question:\n{question}\n
    Answer:
    """
    llm = ChatGoogleGenerativeAI(
        model="gemini-pro",
        temperature=0.3,
        google_api_key=api_key,
    )
    return load_qa_chain(
        llm,
        chain_type="stuff",
        prompt=PromptTemplate(
            template=qa_template,
            input_variables=["context", "question"],
        ),
    )

def user_input(user_question, api_key):
    """
    Answer a user question from the saved FAISS index and display the reply.

    Loads the index stored in the local ``faiss_index`` folder, retrieves the
    chunks most similar to the question, runs them through the QA chain and
    writes the answer to the page. Any failure is reported in the Streamlit
    UI instead of being raised.

    Args:
        user_question: The question text entered by the user.
        api_key: Google API key used for both the embedding and chat clients.

    Returns:
        None. Output is written to the Streamlit page.
    """
    # Without a saved index there is nothing to search; say so plainly rather
    # than surfacing a low-level load error.
    if not os.path.isdir("faiss_index"):
        st.error(
            "Error loading FAISS index or generating response: no saved index found. "
            "Please upload and process your PDF files first."
        )
        return

    try:
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
        # NOTE(security): allow_dangerous_deserialization=True unpickles the
        # stored index metadata. Only load index folders written by this app;
        # never point this at a folder obtained from an untrusted source.
        new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
        docs = new_db.similarity_search(user_question)
        chain = get_conversational_chain(api_key)
        response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
        st.write("Reply:", response["output_text"])
    except Exception as e:
        # UI boundary: index loading and model calls can fail with many
        # exception types (not only ValueError), so report them all here.
        st.error(f"Error loading FAISS index or generating response: {e}")
def main():
    """
    Main function to run the Streamlit app.

    Renders two areas:

    * Main page: a question box and a "Generate Text" button that answers
      the question from the previously processed documents.
    * Sidebar: a PDF uploader and a "Submit & Process" button that extracts
      text, splits it into chunks and builds the vector store.

    Problems (missing API key, empty question, no files, no extractable
    text) are reported in the UI instead of failing silently.
    """
    st.header("AI Assistant 🤖")

    user_question = st.text_input("Ask a Question from the PDF Files", key="user_question")

    if st.button("Generate Text", key="generate_button"):
        if not api_key:
            st.error("Google API key is missing. Please add it to the .env file.")
        elif not user_question or not user_question.strip():
            # Give feedback instead of silently ignoring the click.
            st.warning("Please enter a question before generating a response.")
        else:
            with st.spinner("Generating result..."):
                user_input(user_question, api_key)

    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button", accept_multiple_files=True, key="pdf_uploader")

        if st.button("Submit & Process", key="process_button"):
            if not api_key:
                st.error("Google API key is missing. Please add it to the .env file.")
                return

            if not pdf_docs:
                st.error("No PDF files uploaded. Please upload at least one PDF file to proceed.")
                return

            with st.spinner("Processing..."):
                raw_text = get_pdf_text(pdf_docs)
                text_chunks = get_text_chunks(raw_text)
                # Image-only or otherwise text-less PDFs produce no chunks;
                # stop here rather than attempting to build an empty index.
                if not text_chunks:
                    st.error("No extractable text was found in the uploaded PDF files.")
                    return
                get_vector_store(text_chunks, api_key)
                st.success("Processing complete. You can now ask questions based on the uploaded documents.")


# Run the app only when this file is executed as the entry script.
if __name__ == "__main__":
    main()